Diffstat (limited to 'externals/dynarmic/src')
-rw-r--r--  externals/dynarmic/src/dynarmic/CMakeLists.txt | 497
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp | 424
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h | 35
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h | 30
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp | 239
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp | 74
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h | 45
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp | 600
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h | 35
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h | 30
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp | 323
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h | 37
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp | 91
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/abi.h | 78
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp | 342
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/address_space.h | 136
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h | 57
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp | 290
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h | 181
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp | 707
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp | 299
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp | 107
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp | 518
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp | 128
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp | 166
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp | 1523
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp | 801
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp | 683
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h | 32
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp | 409
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp | 273
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp | 1889
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp | 791
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp | 126
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h | 51
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp | 58
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h | 56
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp | 40
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h | 30
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp | 613
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h | 376
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h | 49
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp | 108
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h | 56
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/block_range_information.cpp | 43
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/block_range_information.h | 29
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler.h | 63
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp | 36
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp | 293
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c | 14
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp | 319
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp | 14
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp | 1299
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h | 147
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp | 259
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp | 343
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp | 208
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h | 97
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp | 763
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h | 144
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp | 431
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp | 449
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp | 116
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h | 84
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/abi.cpp | 135
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/abi.h | 132
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp | 540
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h | 205
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/callback.cpp | 41
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/callback.h | 63
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp | 38
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h | 50
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/constants.h | 177
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h | 81
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp | 415
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h | 145
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp | 109
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp | 154
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp | 1701
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp | 2134
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc | 545
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h | 388
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp | 693
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp | 303
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp | 81
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp | 21
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp | 5929
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp | 2182
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp | 340
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp | 264
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp | 58
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h | 28
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/host_feature.h | 66
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp | 25
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/hostloc.h | 147
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h | 38
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h | 52
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/oparg.h | 79
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp | 94
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/perf_map.h | 25
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp | 777
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h | 188
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h | 38
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp | 56
-rw-r--r--  externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h | 37
-rw-r--r--  externals/dynarmic/src/dynarmic/common/always_false.h | 13
-rw-r--r--  externals/dynarmic/src/dynarmic/common/atomic.h | 44
-rw-r--r--  externals/dynarmic/src/dynarmic/common/cast_util.h | 18
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/aes.cpp | 180
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/aes.h | 23
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp | 168
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/crc32.h | 40
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp | 54
-rw-r--r--  externals/dynarmic/src/dynarmic/common/crypto/sm4.h | 14
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/fpcr.h | 209
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/fpsr.h | 160
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/fused.cpp | 91
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/fused.h | 15
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/info.h | 138
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h | 47
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op.h | 17
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp | 40
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp | 93
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h | 17
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp | 90
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h | 19
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h | 17
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp | 59
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp | 57
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp | 100
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp | 59
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp | 56
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp | 97
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h | 19
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp | 105
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h | 19
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp | 59
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/process_exception.h | 24
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp | 93
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/process_nan.h | 25
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h | 27
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp | 197
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/unpacked.h | 90
-rw-r--r--  externals/dynarmic/src/dynarmic/common/fp/util.h | 99
-rw-r--r--  externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp | 128
-rw-r--r--  externals/dynarmic/src/dynarmic/common/llvm_disassemble.h | 18
-rw-r--r--  externals/dynarmic/src/dynarmic/common/lut_from_list.h | 37
-rw-r--r--  externals/dynarmic/src/dynarmic/common/math_util.cpp | 68
-rw-r--r--  externals/dynarmic/src/dynarmic/common/math_util.h | 47
-rw-r--r--  externals/dynarmic/src/dynarmic/common/memory_pool.cpp | 44
-rw-r--r--  externals/dynarmic/src/dynarmic/common/memory_pool.h | 45
-rw-r--r--  externals/dynarmic/src/dynarmic/common/safe_ops.h | 115
-rw-r--r--  externals/dynarmic/src/dynarmic/common/spin_lock.h | 17
-rw-r--r--  externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp | 86
-rw-r--r--  externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h | 15
-rw-r--r--  externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp | 76
-rw-r--r--  externals/dynarmic/src/dynarmic/common/spin_lock_x64.h | 15
-rw-r--r--  externals/dynarmic/src/dynarmic/common/string_util.h | 15
-rw-r--r--  externals/dynarmic/src/dynarmic/common/u128.cpp | 142
-rw-r--r--  externals/dynarmic/src/dynarmic/common/u128.h | 99
-rw-r--r--  externals/dynarmic/src/dynarmic/common/variant_util.h | 29
-rw-r--r--  externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp | 57
-rw-r--r--  externals/dynarmic/src/dynarmic/common/x64_disassemble.h | 21
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h | 191
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/ITState.h | 64
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/PSR.h | 224
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp | 442
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h | 116
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp | 21
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h | 162
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp | 60
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h | 177
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h | 74
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc | 316
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h | 78
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc | 172
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h | 39
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc | 98
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h | 38
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc | 292
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h | 58
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc | 71
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h | 17
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp | 1584
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp | 397
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp | 27
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h | 52
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp | 82
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h | 33
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp | 88
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp | 95
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp | 44
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp | 110
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h | 982
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp | 375
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp | 93
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp | 113
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp | 973
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp | 767
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp | 188
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp | 349
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp | 27
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp | 166
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp | 1116
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp | 40
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp | 229
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp | 69
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp | 957
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp | 179
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp | 604
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp | 46
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp | 537
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp | 89
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp | 285
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp | 121
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp | 439
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp | 995
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp | 88
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp | 126
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp | 75
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp | 244
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp | 232
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp | 206
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp | 253
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp | 198
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp | 138
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp | 292
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp | 149
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp | 134
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp | 222
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp | 157
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp | 312
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp | 521
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp | 223
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp | 1489
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp | 115
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h | 35
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp | 227
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp | 265
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h | 102
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h | 115
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp | 33
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h | 142
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h | 83
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc | 1029
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp | 69
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h | 59
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp | 128
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp | 22
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp | 330
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp | 155
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp | 62
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp | 58
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp | 76
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp | 236
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp | 100
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp | 24
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp | 139
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp | 58
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp | 38
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp | 35
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp | 24
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp | 114
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp | 199
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp | 186
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp | 66
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp | 145
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp | 409
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h | 1084
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp | 209
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp | 55
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp | 134
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp | 22
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp | 249
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp | 154
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp | 160
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp | 149
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp | 224
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp | 59
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp | 220
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp | 46
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp | 158
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp | 52
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp | 102
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp | 27
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp | 130
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp | 113
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp | 80
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp | 354
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp | 430
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp | 331
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp | 168
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp | 149
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp | 300
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp | 402
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp | 40
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp | 256
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp | 1240
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp | 174
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp | 848
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp | 408
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp | 51
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp | 31
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp | 161
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp | 46
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp | 62
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h | 190
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h | 76
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/imm.cpp | 67
-rw-r--r--  externals/dynarmic/src/dynarmic/frontend/imm.h | 168
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/a32.h | 109
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/arch_version.h | 23
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/config.h | 246
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h | 110
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h | 31
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A32/disassembler.h | 18
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A64/a64.h | 137
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/A64/config.h | 297
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h | 88
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/halt_reason.h | 54
-rw-r--r--  externals/dynarmic/src/dynarmic/interface/optimization_flags.h | 81
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/acc_type.h | 29
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/basic_block.cpp | 244
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/basic_block.h | 168
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/cond.h | 31
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp | 2890
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/ir_emitter.h | 432
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp | 16
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/location_descriptor.h | 64
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/microinstruction.cpp | 712
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/microinstruction.h | 162
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opcodes.cpp | 71
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opcodes.h | 54
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opcodes.inc | 770
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp | 70
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp | 377
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp | 57
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp | 161
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp | 54
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp | 556
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp | 23
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp | 44
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h | 127
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp | 18
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/passes.h | 47
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp | 218
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp | 47
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/terminal.h | 131
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/type.cpp | 46
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/type.h | 60
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/value.cpp | 254
-rw-r--r--  externals/dynarmic/src/dynarmic/ir/value.h | 187
359 files changed, 84627 insertions, 0 deletions
diff --git a/externals/dynarmic/src/dynarmic/CMakeLists.txt b/externals/dynarmic/src/dynarmic/CMakeLists.txt
new file mode 100644
index 0000000000..0d6068af6a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt
@@ -0,0 +1,497 @@
+include(TargetArchitectureSpecificSources)
+
+add_library(dynarmic
+ backend/block_range_information.cpp
+ backend/block_range_information.h
+ backend/exception_handler.h
+ common/always_false.h
+ common/cast_util.h
+ common/crypto/aes.cpp
+ common/crypto/aes.h
+ common/crypto/crc32.cpp
+ common/crypto/crc32.h
+ common/crypto/sm4.cpp
+ common/crypto/sm4.h
+ common/fp/fpcr.h
+ common/fp/fpsr.h
+ common/fp/fused.cpp
+ common/fp/fused.h
+ common/fp/info.h
+ common/fp/mantissa_util.h
+ common/fp/op.h
+ common/fp/op/FPCompare.cpp
+ common/fp/op/FPCompare.h
+ common/fp/op/FPConvert.cpp
+ common/fp/op/FPConvert.h
+ common/fp/op/FPMulAdd.cpp
+ common/fp/op/FPMulAdd.h
+ common/fp/op/FPNeg.h
+ common/fp/op/FPRecipEstimate.cpp
+ common/fp/op/FPRecipEstimate.h
+ common/fp/op/FPRecipExponent.cpp
+ common/fp/op/FPRecipExponent.h
+ common/fp/op/FPRecipStepFused.cpp
+ common/fp/op/FPRecipStepFused.h
+ common/fp/op/FPRoundInt.cpp
+ common/fp/op/FPRoundInt.h
+ common/fp/op/FPRSqrtEstimate.cpp
+ common/fp/op/FPRSqrtEstimate.h
+ common/fp/op/FPRSqrtStepFused.cpp
+ common/fp/op/FPRSqrtStepFused.h
+ common/fp/op/FPToFixed.cpp
+ common/fp/op/FPToFixed.h
+ common/fp/process_exception.cpp
+ common/fp/process_exception.h
+ common/fp/process_nan.cpp
+ common/fp/process_nan.h
+ common/fp/rounding_mode.h
+ common/fp/unpacked.cpp
+ common/fp/unpacked.h
+ common/fp/util.h
+ common/llvm_disassemble.cpp
+ common/llvm_disassemble.h
+ common/lut_from_list.h
+ common/math_util.cpp
+ common/math_util.h
+ common/memory_pool.cpp
+ common/memory_pool.h
+ common/safe_ops.h
+ common/spin_lock.h
+ common/string_util.h
+ common/u128.cpp
+ common/u128.h
+ common/variant_util.h
+ frontend/A32/a32_types.cpp
+ frontend/A32/a32_types.h
+ frontend/A64/a64_types.cpp
+ frontend/A64/a64_types.h
+ frontend/decoder/decoder_detail.h
+ frontend/decoder/matcher.h
+ frontend/imm.cpp
+ frontend/imm.h
+ interface/exclusive_monitor.h
+ interface/optimization_flags.h
+ ir/acc_type.h
+ ir/basic_block.cpp
+ ir/basic_block.h
+ ir/cond.h
+ ir/ir_emitter.cpp
+ ir/ir_emitter.h
+ ir/location_descriptor.cpp
+ ir/location_descriptor.h
+ ir/microinstruction.cpp
+ ir/microinstruction.h
+ ir/opcodes.cpp
+ ir/opcodes.h
+ ir/opcodes.inc
+ ir/opt/constant_propagation_pass.cpp
+ ir/opt/dead_code_elimination_pass.cpp
+ ir/opt/identity_removal_pass.cpp
+ ir/opt/ir_matcher.h
+ ir/opt/naming_pass.cpp
+ ir/opt/passes.h
+ ir/opt/polyfill_pass.cpp
+ ir/opt/verification_pass.cpp
+ ir/terminal.h
+ ir/type.cpp
+ ir/type.h
+ ir/value.cpp
+ ir/value.h
+)
+
+if ("A32" IN_LIST DYNARMIC_FRONTENDS)
+ target_sources(dynarmic PRIVATE
+ frontend/A32/a32_ir_emitter.cpp
+ frontend/A32/a32_ir_emitter.h
+ frontend/A32/a32_location_descriptor.cpp
+ frontend/A32/a32_location_descriptor.h
+ frontend/A32/decoder/arm.h
+ frontend/A32/decoder/arm.inc
+ frontend/A32/decoder/asimd.h
+ frontend/A32/decoder/asimd.inc
+ frontend/A32/decoder/thumb16.h
+ frontend/A32/decoder/thumb16.inc
+ frontend/A32/decoder/thumb32.h
+ frontend/A32/decoder/thumb32.inc
+ frontend/A32/decoder/vfp.h
+ frontend/A32/decoder/vfp.inc
+ frontend/A32/disassembler/disassembler.h
+ frontend/A32/disassembler/disassembler_arm.cpp
+ frontend/A32/disassembler/disassembler_thumb.cpp
+ frontend/A32/FPSCR.h
+ frontend/A32/ITState.h
+ frontend/A32/PSR.h
+ frontend/A32/translate/a32_translate.cpp
+ frontend/A32/translate/a32_translate.h
+ frontend/A32/translate/conditional_state.cpp
+ frontend/A32/translate/conditional_state.h
+ frontend/A32/translate/impl/a32_branch.cpp
+ frontend/A32/translate/impl/a32_crc32.cpp
+ frontend/A32/translate/impl/a32_exception_generating.cpp
+ frontend/A32/translate/impl/a32_translate_impl.cpp
+ frontend/A32/translate/impl/a32_translate_impl.h
+ frontend/A32/translate/impl/asimd_load_store_structures.cpp
+ frontend/A32/translate/impl/asimd_misc.cpp
+ frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp
+ frontend/A32/translate/impl/asimd_three_regs.cpp
+ frontend/A32/translate/impl/asimd_two_regs_misc.cpp
+ frontend/A32/translate/impl/asimd_two_regs_scalar.cpp
+ frontend/A32/translate/impl/asimd_two_regs_shift.cpp
+ frontend/A32/translate/impl/barrier.cpp
+ frontend/A32/translate/impl/coprocessor.cpp
+ frontend/A32/translate/impl/data_processing.cpp
+ frontend/A32/translate/impl/divide.cpp
+ frontend/A32/translate/impl/extension.cpp
+ frontend/A32/translate/impl/hint.cpp
+ frontend/A32/translate/impl/load_store.cpp
+ frontend/A32/translate/impl/misc.cpp
+ frontend/A32/translate/impl/multiply.cpp
+ frontend/A32/translate/impl/packing.cpp
+ frontend/A32/translate/impl/parallel.cpp
+ frontend/A32/translate/impl/reversal.cpp
+ frontend/A32/translate/impl/saturated.cpp
+ frontend/A32/translate/impl/status_register_access.cpp
+ frontend/A32/translate/impl/synchronization.cpp
+ frontend/A32/translate/impl/thumb16.cpp
+ frontend/A32/translate/impl/thumb32_branch.cpp
+ frontend/A32/translate/impl/thumb32_control.cpp
+ frontend/A32/translate/impl/thumb32_coprocessor.cpp
+ frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp
+ frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp
+ frontend/A32/translate/impl/thumb32_data_processing_register.cpp
+ frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp
+ frontend/A32/translate/impl/thumb32_load_byte.cpp
+ frontend/A32/translate/impl/thumb32_load_halfword.cpp
+ frontend/A32/translate/impl/thumb32_load_store_dual.cpp
+ frontend/A32/translate/impl/thumb32_load_store_multiple.cpp
+ frontend/A32/translate/impl/thumb32_load_word.cpp
+ frontend/A32/translate/impl/thumb32_long_multiply.cpp
+ frontend/A32/translate/impl/thumb32_misc.cpp
+ frontend/A32/translate/impl/thumb32_multiply.cpp
+ frontend/A32/translate/impl/thumb32_parallel.cpp
+ frontend/A32/translate/impl/thumb32_store_single_data_item.cpp
+ frontend/A32/translate/impl/vfp.cpp
+ frontend/A32/translate/translate_arm.cpp
+ frontend/A32/translate/translate_thumb.cpp
+ interface/A32/a32.h
+ interface/A32/arch_version.h
+ interface/A32/config.h
+ interface/A32/coprocessor.h
+ interface/A32/coprocessor_util.h
+ interface/A32/disassembler.h
+ ir/opt/a32_constant_memory_reads_pass.cpp
+ ir/opt/a32_get_set_elimination_pass.cpp
+ )
+endif()
+
+if ("A64" IN_LIST DYNARMIC_FRONTENDS)
+ target_sources(dynarmic PRIVATE
+ frontend/A64/a64_ir_emitter.cpp
+ frontend/A64/a64_ir_emitter.h
+ frontend/A64/a64_location_descriptor.cpp
+ frontend/A64/a64_location_descriptor.h
+ frontend/A64/decoder/a64.h
+ frontend/A64/decoder/a64.inc
+ frontend/A64/translate/a64_translate.cpp
+ frontend/A64/translate/a64_translate.h
+ frontend/A64/translate/impl/a64_branch.cpp
+ frontend/A64/translate/impl/a64_exception_generating.cpp
+ frontend/A64/translate/impl/data_processing_addsub.cpp
+ frontend/A64/translate/impl/data_processing_bitfield.cpp
+ frontend/A64/translate/impl/data_processing_conditional_compare.cpp
+ frontend/A64/translate/impl/data_processing_conditional_select.cpp
+ frontend/A64/translate/impl/data_processing_crc32.cpp
+ frontend/A64/translate/impl/data_processing_logical.cpp
+ frontend/A64/translate/impl/data_processing_multiply.cpp
+ frontend/A64/translate/impl/data_processing_pcrel.cpp
+ frontend/A64/translate/impl/data_processing_register.cpp
+ frontend/A64/translate/impl/data_processing_shift.cpp
+ frontend/A64/translate/impl/floating_point_compare.cpp
+ frontend/A64/translate/impl/floating_point_conditional_compare.cpp
+ frontend/A64/translate/impl/floating_point_conditional_select.cpp
+ frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp
+ frontend/A64/translate/impl/floating_point_conversion_integer.cpp
+ frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
+ frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
+ frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
+ frontend/A64/translate/impl/impl.cpp
+ frontend/A64/translate/impl/impl.h
+ frontend/A64/translate/impl/load_store_exclusive.cpp
+ frontend/A64/translate/impl/load_store_load_literal.cpp
+ frontend/A64/translate/impl/load_store_multiple_structures.cpp
+ frontend/A64/translate/impl/load_store_no_allocate_pair.cpp
+ frontend/A64/translate/impl/load_store_register_immediate.cpp
+ frontend/A64/translate/impl/load_store_register_pair.cpp
+ frontend/A64/translate/impl/load_store_register_register_offset.cpp
+ frontend/A64/translate/impl/load_store_register_unprivileged.cpp
+ frontend/A64/translate/impl/load_store_single_structure.cpp
+ frontend/A64/translate/impl/move_wide.cpp
+ frontend/A64/translate/impl/simd_across_lanes.cpp
+ frontend/A64/translate/impl/simd_aes.cpp
+ frontend/A64/translate/impl/simd_copy.cpp
+ frontend/A64/translate/impl/simd_crypto_four_register.cpp
+ frontend/A64/translate/impl/simd_crypto_three_register.cpp
+ frontend/A64/translate/impl/simd_extract.cpp
+ frontend/A64/translate/impl/simd_modified_immediate.cpp
+ frontend/A64/translate/impl/simd_permute.cpp
+ frontend/A64/translate/impl/simd_scalar_pairwise.cpp
+ frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp
+ frontend/A64/translate/impl/simd_scalar_three_same.cpp
+ frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp
+ frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp
+ frontend/A64/translate/impl/simd_sha.cpp
+ frontend/A64/translate/impl/simd_sha512.cpp
+ frontend/A64/translate/impl/simd_shift_by_immediate.cpp
+ frontend/A64/translate/impl/simd_table_lookup.cpp
+ frontend/A64/translate/impl/simd_three_different.cpp
+ frontend/A64/translate/impl/simd_three_same.cpp
+ frontend/A64/translate/impl/simd_three_same_extra.cpp
+ frontend/A64/translate/impl/simd_two_register_misc.cpp
+ frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
+ frontend/A64/translate/impl/sys_dc.cpp
+ frontend/A64/translate/impl/sys_ic.cpp
+ frontend/A64/translate/impl/system.cpp
+ frontend/A64/translate/impl/system_flag_format.cpp
+ frontend/A64/translate/impl/system_flag_manipulation.cpp
+ interface/A64/a64.h
+ interface/A64/config.h
+ ir/opt/a64_callback_config_pass.cpp
+ ir/opt/a64_get_set_elimination_pass.cpp
+ ir/opt/a64_merge_interpret_blocks.cpp
+ )
+endif()
+
+if ("x86_64" IN_LIST ARCHITECTURE)
+ target_link_libraries(dynarmic
+ PRIVATE
+ xbyak::xbyak
+ Zydis::Zydis
+ )
+
+ target_architecture_specific_sources(dynarmic "x86_64"
+ backend/x64/abi.cpp
+ backend/x64/abi.h
+ backend/x64/block_of_code.cpp
+ backend/x64/block_of_code.h
+ backend/x64/callback.cpp
+ backend/x64/callback.h
+ backend/x64/constant_pool.cpp
+ backend/x64/constant_pool.h
+ backend/x64/constants.h
+ backend/x64/devirtualize.h
+ backend/x64/emit_x64.cpp
+ backend/x64/emit_x64.h
+ backend/x64/emit_x64_aes.cpp
+ backend/x64/emit_x64_crc32.cpp
+ backend/x64/emit_x64_data_processing.cpp
+ backend/x64/emit_x64_floating_point.cpp
+ backend/x64/emit_x64_memory.cpp.inc
+ backend/x64/emit_x64_memory.h
+ backend/x64/emit_x64_packed.cpp
+ backend/x64/emit_x64_saturation.cpp
+ backend/x64/emit_x64_sha.cpp
+ backend/x64/emit_x64_sm4.cpp
+ backend/x64/emit_x64_vector.cpp
+ backend/x64/emit_x64_vector_floating_point.cpp
+ backend/x64/emit_x64_vector_saturation.cpp
+ backend/x64/exclusive_monitor.cpp
+ backend/x64/exclusive_monitor_friend.h
+ backend/x64/host_feature.h
+ backend/x64/hostloc.cpp
+ backend/x64/hostloc.h
+ backend/x64/jitstate_info.h
+ backend/x64/oparg.h
+ backend/x64/perf_map.cpp
+ backend/x64/perf_map.h
+ backend/x64/reg_alloc.cpp
+ backend/x64/reg_alloc.h
+ backend/x64/stack_layout.h
+ backend/x64/verbose_debugging_output.cpp
+ backend/x64/verbose_debugging_output.h
+ common/spin_lock_x64.cpp
+ common/spin_lock_x64.h
+ common/x64_disassemble.cpp
+ common/x64_disassemble.h
+ )
+
+ if ("A32" IN_LIST DYNARMIC_FRONTENDS)
+ target_architecture_specific_sources(dynarmic "x86_64"
+ backend/x64/a32_emit_x64.cpp
+ backend/x64/a32_emit_x64.h
+ backend/x64/a32_emit_x64_memory.cpp
+ backend/x64/a32_interface.cpp
+ backend/x64/a32_jitstate.cpp
+ backend/x64/a32_jitstate.h
+ )
+ endif()
+
+ if ("A64" IN_LIST DYNARMIC_FRONTENDS)
+ target_architecture_specific_sources(dynarmic "x86_64"
+ backend/x64/a64_emit_x64.cpp
+ backend/x64/a64_emit_x64.h
+ backend/x64/a64_emit_x64_memory.cpp
+ backend/x64/a64_interface.cpp
+ backend/x64/a64_jitstate.cpp
+ backend/x64/a64_jitstate.h
+ )
+ endif()
+endif()
+
+if ("arm64" IN_LIST ARCHITECTURE)
+ target_link_libraries(dynarmic PRIVATE merry::oaknut)
+
+ target_architecture_specific_sources(dynarmic "arm64"
+ backend/arm64/a32_jitstate.cpp
+ backend/arm64/a32_jitstate.h
+ backend/arm64/a64_jitstate.h
+ backend/arm64/abi.cpp
+ backend/arm64/abi.h
+ backend/arm64/address_space.cpp
+ backend/arm64/address_space.h
+ backend/arm64/devirtualize.h
+ backend/arm64/emit_arm64.cpp
+ backend/arm64/emit_arm64.h
+ backend/arm64/emit_arm64_a32.cpp
+ backend/arm64/emit_arm64_a32_coprocessor.cpp
+ backend/arm64/emit_arm64_a32_memory.cpp
+ backend/arm64/emit_arm64_a64.cpp
+ backend/arm64/emit_arm64_a64_memory.cpp
+ backend/arm64/emit_arm64_cryptography.cpp
+ backend/arm64/emit_arm64_data_processing.cpp
+ backend/arm64/emit_arm64_floating_point.cpp
+ backend/arm64/emit_arm64_memory.cpp
+ backend/arm64/emit_arm64_memory.h
+ backend/arm64/emit_arm64_packed.cpp
+ backend/arm64/emit_arm64_saturation.cpp
+ backend/arm64/emit_arm64_vector.cpp
+ backend/arm64/emit_arm64_vector_floating_point.cpp
+ backend/arm64/emit_arm64_vector_saturation.cpp
+ backend/arm64/emit_context.h
+ backend/arm64/exclusive_monitor.cpp
+ backend/arm64/fastmem.h
+ backend/arm64/fpsr_manager.cpp
+ backend/arm64/fpsr_manager.h
+ backend/arm64/reg_alloc.cpp
+ backend/arm64/reg_alloc.h
+ backend/arm64/stack_layout.h
+ backend/arm64/verbose_debugging_output.cpp
+ backend/arm64/verbose_debugging_output.h
+ common/spin_lock_arm64.cpp
+ common/spin_lock_arm64.h
+ )
+
+ if ("A32" IN_LIST DYNARMIC_FRONTENDS)
+ target_architecture_specific_sources(dynarmic "arm64"
+ backend/arm64/a32_address_space.cpp
+ backend/arm64/a32_address_space.h
+ backend/arm64/a32_core.h
+ backend/arm64/a32_interface.cpp
+ )
+ endif()
+
+ if ("A64" IN_LIST DYNARMIC_FRONTENDS)
+ target_architecture_specific_sources(dynarmic "arm64"
+ backend/arm64/a64_address_space.cpp
+ backend/arm64/a64_address_space.h
+ backend/arm64/a64_core.h
+ backend/arm64/a64_interface.cpp
+ )
+ endif()
+endif()
+
+if (WIN32)
+ target_sources(dynarmic PRIVATE backend/exception_handler_windows.cpp)
+elseif (APPLE)
+ find_path(MACH_EXC_DEFS_DIR "mach/mach_exc.defs")
+ if (NOT MACH_EXC_DEFS_DIR)
+ message(WARNING "macOS fastmem disabled: unable to find mach/mach_exc.defs")
+ target_sources(dynarmic PRIVATE backend/exception_handler_generic.cpp)
+ else()
+ message(STATUS "mach/mach_exc.defs location: ${MACH_EXC_DEFS_DIR}")
+ execute_process(
+ COMMAND
+ mkdir -p "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig"
+ COMMAND
+ mig
+ -arch x86_64
+ -user "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_user.c"
+ -header "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_user.h"
+ -server "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_server.c"
+ -sheader "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_server.h"
+ "${MACH_EXC_DEFS_DIR}/mach/mach_exc.defs"
+ )
+ execute_process(
+ COMMAND
+ mkdir -p "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig"
+ COMMAND
+ mig
+ -arch arm64
+ -user "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_user.c"
+ -header "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_user.h"
+ -server "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_server.c"
+ -sheader "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_server.h"
+ "${MACH_EXC_DEFS_DIR}/mach/mach_exc.defs"
+ )
+ target_sources(dynarmic PRIVATE
+ backend/exception_handler_macos.cpp
+ backend/exception_handler_macos_mig.c
+ )
+ endif()
+elseif (UNIX)
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ target_link_libraries(dynarmic PRIVATE rt)
+ endif()
+ target_sources(dynarmic PRIVATE backend/exception_handler_posix.cpp)
+else()
+ target_sources(dynarmic PRIVATE backend/exception_handler_generic.cpp)
+endif()
+
+include(CreateDirectoryGroups)
+create_target_directory_groups(dynarmic)
+
+target_include_directories(dynarmic PUBLIC
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+ $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+set_target_properties(dynarmic PROPERTIES
+ VERSION ${dynarmic_VERSION}
+ SOVERSION ${dynarmic_VERSION_MAJOR}.${dynarmic_VERSION_MINOR}
+)
+target_compile_options(dynarmic PRIVATE ${DYNARMIC_CXX_FLAGS})
+target_link_libraries(dynarmic
+ PRIVATE
+ Boost::boost
+ fmt::fmt
+ merry::mcl
+ tsl::robin_map
+)
+if (DYNARMIC_USE_LLVM)
+ target_include_directories(dynarmic PRIVATE ${LLVM_INCLUDE_DIRS})
+ target_compile_definitions(dynarmic PRIVATE DYNARMIC_USE_LLVM=1 ${LLVM_DEFINITIONS})
+ llvm_config(dynarmic USE_SHARED armdesc armdisassembler aarch64desc aarch64disassembler x86desc x86disassembler)
+endif()
+if (DYNARMIC_ENABLE_CPU_FEATURE_DETECTION)
+ target_compile_definitions(dynarmic PRIVATE DYNARMIC_ENABLE_CPU_FEATURE_DETECTION=1)
+endif()
+if (DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT)
+ target_compile_definitions(dynarmic PRIVATE DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT=1)
+endif()
+if (DYNARMIC_IGNORE_ASSERTS)
+ target_compile_definitions(dynarmic PRIVATE MCL_IGNORE_ASSERTS=1)
+endif()
+if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
+ target_compile_definitions(dynarmic PRIVATE FMT_USE_WINDOWS_H=0)
+endif()
+target_compile_definitions(dynarmic PRIVATE FMT_USE_USER_DEFINED_LITERALS=1)
+
+if (DYNARMIC_USE_PRECOMPILED_HEADERS)
+ set(PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:${CMAKE_CURRENT_SOURCE_DIR}/ir/ir_emitter.h>")
+ if ("x86_64" IN_LIST ARCHITECTURE)
+ list(PREPEND PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:<xbyak/xbyak.h$<ANGLE-R>>")
+ endif()
+ if ("arm64" IN_LIST ARCHITECTURE)
+ list(PREPEND PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:<oaknut/oaknut.hpp$<ANGLE-R>>")
+ endif()
+ target_precompile_headers(dynarmic PRIVATE ${PRECOMPILED_HEADERS})
+ set(CMAKE_PCH_INSTANTIATE_TEMPLATES ON)
+endif()
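
The options this CMakeLists.txt reacts to are DYNARMIC_FRONTENDS (checked with IN_LIST above), the per-architecture source lists, and the PUBLIC include directory. A minimal sketch of how a parent project might consume the target, under two assumptions not shown in this diff: that add_subdirectory() is pointed at the externals/dynarmic project root, and that DYNARMIC_FRONTENDS is honoured there as a cache variable.

```cmake
# Hypothetical parent CMakeLists.txt (illustrative only).
cmake_minimum_required(VERSION 3.16)
project(consumer CXX)

# Select guest frontends before adding the subdirectory; the CMakeLists
# above checks this list with IN_LIST to decide which translator and
# backend sources to compile.
set(DYNARMIC_FRONTENDS "A32;A64" CACHE STRING "Dynarmic frontends to build")

add_subdirectory(externals/dynarmic)

add_executable(consumer main.cpp)
# Linking the dynarmic target also propagates its PUBLIC include
# directory, so headers under dynarmic/interface/ resolve directly.
target_link_libraries(consumer PRIVATE dynarmic)
```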
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp
new file mode 100644
index 0000000000..cafc47909f
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp
@@ -0,0 +1,424 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/a32_address_space.h"
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/devirtualize.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/stack_layout.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+template<auto mfp, typename T>
+static void* EmitCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+template<auto mfp, typename T>
+static void* EmitWrappedReadCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Xscratch0);
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.MOV(Xscratch0, X0);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+template<auto callback, typename T>
+static void* EmitExclusiveReadCallTrampoline(oaknut::CodeGenerator& code, const A32::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A32::UserConfig& conf, A32::VAddr vaddr) -> T {
+ return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T {
+ return (conf.callbacks->*callback)(vaddr);
+ });
+ };
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
+template<auto mfp, typename T>
+static void* EmitWrappedWriteCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE;
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.MOV(X2, Xscratch1);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+template<auto callback, typename T>
+static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const A32::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A32::UserConfig& conf, A32::VAddr vaddr, T value) -> u32 {
+ return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr,
+ [&](T expected) -> bool {
+ return (conf.callbacks->*callback)(vaddr, value, expected);
+ })
+ ? 0
+ : 1;
+ };
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
+A32AddressSpace::A32AddressSpace(const A32::UserConfig& conf)
+ : AddressSpace(conf.code_cache_size)
+ , conf(conf) {
+ EmitPrelude();
+}
+
+IR::Block A32AddressSpace::GenerateIR(IR::LocationDescriptor descriptor) const {
+ IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions});
+
+ Optimization::PolyfillPass(ir_block, {});
+ Optimization::NamingPass(ir_block);
+ if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
+ Optimization::A32GetSetElimination(ir_block, {.convert_nzc_to_nz = true});
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::ConstProp)) {
+ Optimization::A32ConstantMemoryReads(ir_block, conf.callbacks);
+ Optimization::ConstantPropagation(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ Optimization::IdentityRemovalPass(ir_block);
+ Optimization::VerificationPass(ir_block);
+
+ return ir_block;
+}
+
+void A32AddressSpace::InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges) {
+ InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
+}
+
+void A32AddressSpace::EmitPrelude() {
+ using namespace oaknut::util;
+
+ UnprotectCodeMemory();
+
+ prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks);
+ prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks);
+ prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks);
+ prelude_info.read_memory_64 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead64>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_8 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_16 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_32 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_64 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead64>(code, conf.callbacks);
+ prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead8, u8>(code, conf);
+ prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead16, u16>(code, conf);
+ prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead32, u32>(code, conf);
+ prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead64, u64>(code, conf);
+ prelude_info.write_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
+ prelude_info.write_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
+ prelude_info.write_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
+ prelude_info.write_memory_64 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_8 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_16 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_32 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_64 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
+ prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf);
+ prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf);
+ prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf);
+ prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf);
+ prelude_info.call_svc = EmitCallTrampoline<&A32::UserCallbacks::CallSVC>(code, conf.callbacks);
+ prelude_info.exception_raised = EmitCallTrampoline<&A32::UserCallbacks::ExceptionRaised>(code, conf.callbacks);
+ prelude_info.isb_raised = EmitCallTrampoline<&A32::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks);
+ prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks);
+ prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks);
+
+ oaknut::Label return_from_run_code, l_return_to_dispatcher;
+
+ prelude_info.run_code = code.xptr<PreludeInfo::RunCodeFuncType>();
+ {
+ ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+
+ code.MOV(X19, X0);
+ code.MOV(Xstate, X1);
+ code.MOV(Xhalt, X2);
+ if (conf.page_table) {
+ code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+
+ if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ code.LDR(Xscratch0, l_return_to_dispatcher);
+ for (size_t i = 0; i < RSBCount; i++) {
+ code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry));
+ }
+ }
+
+ if (conf.enable_cycle_counting) {
+ code.BL(prelude_info.get_ticks_remaining);
+ code.MOV(Xticks, X0);
+ code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
+ }
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.AND(Wscratch0, Wscratch0, 0xffff0000);
+ code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
+ code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+
+ code.BR(X19);
+ }
+
+ prelude_info.step_code = code.xptr<PreludeInfo::RunCodeFuncType>();
+ {
+ ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+
+ code.MOV(X19, X0);
+ code.MOV(Xstate, X1);
+ code.MOV(Xhalt, X2);
+ if (conf.page_table) {
+ code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+
+ if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ code.LDR(Xscratch0, l_return_to_dispatcher);
+ for (size_t i = 0; i < RSBCount; i++) {
+ code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry));
+ }
+ }
+
+ if (conf.enable_cycle_counting) {
+ code.MOV(Xticks, 1);
+ code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
+ }
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.AND(Wscratch0, Wscratch0, 0xffff0000);
+ code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
+ code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
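+        // Atomically set HaltReason::Step in the halt word (LDAXR/STLXR retry loop), bailing out
+        // immediately if another halt reason is already pending, so execution returns to the
+        // host after the single-stepped block.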
+ oaknut::Label step_hr_loop;
+ code.l(step_hr_loop);
+ code.LDAXR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+ code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
+ code.STLXR(Wscratch1, Wscratch0, Xhalt);
+ code.CBNZ(Wscratch1, step_hr_loop);
+
+ code.BR(X19);
+ }
+
+ prelude_info.return_to_dispatcher = code.xptr<void*>();
+ {
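+        // Dispatcher: bail out if a halt was requested or the cycle budget is exhausted,
+        // otherwise call back into C++ (the lambda below, via GetOrEmit) to look up or compile
+        // the next block and branch to it.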
+ oaknut::Label l_this, l_addr;
+
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+
+ if (conf.enable_cycle_counting) {
+ code.CMP(Xticks, 0);
+ code.B(LE, return_from_run_code);
+ }
+
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xstate);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.BR(X0);
+
+ const auto fn = [](A32AddressSpace& self, A32JitState& context) -> CodePtr {
+ return self.GetOrEmit(context.GetLocationDescriptor());
+ };
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(this));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+ }
+
+ prelude_info.return_from_run_code = code.xptr<void*>();
+ {
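+        // Common exit path: report consumed cycles via AddTicks, restore the host FPCR, then
+        // atomically swap the halt word with zero; its previous value is left in W0 as the
+        // returned HaltReason before registers are restored and control returns to the caller.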
+ code.l(return_from_run_code);
+
+ if (conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ code.BL(prelude_info.add_ticks);
+ }
+
+ code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ oaknut::Label exit_hr_loop;
+ code.l(exit_hr_loop);
+ code.LDAXR(W0, Xhalt);
+ code.STLXR(Wscratch0, WZR, Xhalt);
+ code.CBNZ(Wscratch0, exit_hr_loop);
+
+ ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+ code.RET();
+ }
+
+ code.align(8);
+ code.l(l_return_to_dispatcher);
+ code.dx(mcl::bit_cast<u64>(prelude_info.return_to_dispatcher));
+
+ prelude_info.end_of_prelude = code.offset();
+
+ mem.invalidate_all();
+ ProtectCodeMemory();
+}
+
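+// A32 guests always use a 32-bit address space, so the page table and fastmem region are fixed
+// at 32 address bits and silently mirrored; the AArch64 system-register fields are left empty.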
+EmitConfig A32AddressSpace::GetEmitConfig() {
+ return EmitConfig{
+ .optimizations = conf.unsafe_optimizations ? conf.optimizations : conf.optimizations & all_safe_optimizations,
+
+ .hook_isb = conf.hook_isb,
+
+ .cntfreq_el0{},
+ .ctr_el0{},
+ .dczid_el0{},
+ .tpidrro_el0{},
+ .tpidr_el0{},
+
+ .check_halt_on_memory_access = conf.check_halt_on_memory_access,
+
+ .page_table_pointer = mcl::bit_cast<u64>(conf.page_table),
+ .page_table_address_space_bits = 32,
+ .page_table_pointer_mask_bits = conf.page_table_pointer_mask_bits,
+ .silently_mirror_page_table = true,
+ .absolute_offset_page_table = conf.absolute_offset_page_table,
+ .detect_misaligned_access_via_page_table = conf.detect_misaligned_access_via_page_table,
+ .only_detect_misalignment_via_page_table_on_page_boundary = conf.only_detect_misalignment_via_page_table_on_page_boundary,
+
+ .fastmem_pointer = mcl::bit_cast<u64>(conf.fastmem_pointer),
+ .recompile_on_fastmem_failure = conf.recompile_on_fastmem_failure,
+ .fastmem_address_space_bits = 32,
+ .silently_mirror_fastmem = true,
+
+ .wall_clock_cntpct = conf.wall_clock_cntpct,
+ .enable_cycle_counting = conf.enable_cycle_counting,
+
+ .always_little_endian = conf.always_little_endian,
+
+ .descriptor_to_fpcr = [](const IR::LocationDescriptor& location) { return FP::FPCR{A32::LocationDescriptor{location}.FPSCR().Value()}; },
+ .emit_cond = EmitA32Cond,
+ .emit_condition_failed_terminal = EmitA32ConditionFailedTerminal,
+ .emit_terminal = EmitA32Terminal,
+ .emit_check_memory_abort = EmitA32CheckMemoryAbort,
+
+ .state_nzcv_offset = offsetof(A32JitState, cpsr_nzcv),
+ .state_fpsr_offset = offsetof(A32JitState, fpsr),
+ .state_exclusive_state_offset = offsetof(A32JitState, exclusive_state),
+
+ .coprocessors = conf.coprocessors,
+
+ .very_verbose_debugging_output = conf.very_verbose_debugging_output,
+ };
+}
+
+void A32AddressSpace::RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo&) {
+ const A32::LocationDescriptor descriptor{block.Location()};
+ const A32::LocationDescriptor end_location{block.EndLocation()};
+ const auto range = boost::icl::discrete_interval<u32>::closed(descriptor.PC(), end_location.PC() - 1);
+ block_ranges.AddRange(range, descriptor);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h
new file mode 100644
index 0000000000..3b9e9fb2f6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h
@@ -0,0 +1,35 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/backend/arm64/address_space.h"
+#include "dynarmic/backend/block_range_information.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmittedBlockInfo;
+
+class A32AddressSpace final : public AddressSpace {
+public:
+ explicit A32AddressSpace(const A32::UserConfig& conf);
+
+ IR::Block GenerateIR(IR::LocationDescriptor) const override;
+
+ void InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges);
+
+protected:
+ friend class A32Core;
+
+ void EmitPrelude();
+ EmitConfig GetEmitConfig() override;
+ void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) override;
+
+ const A32::UserConfig conf;
+ BlockRangeInformation<u32> block_ranges;
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h
new file mode 100644
index 0000000000..86f6f284d8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h
@@ -0,0 +1,30 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/backend/arm64/a32_address_space.h"
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+class A32Core final {
+public:
+ explicit A32Core(const A32::UserConfig&) {}
+
+ HaltReason Run(A32AddressSpace& process, A32JitState& thread_ctx, volatile u32* halt_reason) {
+ const auto location_descriptor = thread_ctx.GetLocationDescriptor();
+ const auto entry_point = process.GetOrEmit(location_descriptor);
+ return process.prelude_info.run_code(entry_point, &thread_ctx, halt_reason);
+ }
+
+ HaltReason Step(A32AddressSpace& process, A32JitState& thread_ctx, volatile u32* halt_reason) {
+ const auto location_descriptor = A32::LocationDescriptor{thread_ctx.GetLocationDescriptor()}.SetSingleStepping(true);
+ const auto entry_point = process.GetOrEmit(location_descriptor);
+ return process.prelude_info.step_code(entry_point, &thread_ctx, halt_reason);
+ }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp
new file mode 100644
index 0000000000..6b38c41093
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp
@@ -0,0 +1,239 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <memory>
+#include <mutex>
+
+#include <boost/icl/interval_set.hpp>
+#include <mcl/assert.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/arm64/a32_address_space.h"
+#include "dynarmic/backend/arm64/a32_core.h"
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/common/atomic.h"
+#include "dynarmic/interface/A32/a32.h"
+
+namespace Dynarmic::A32 {
+
+using namespace Backend::Arm64;
+
+struct Jit::Impl final {
+ Impl(Jit* jit_interface, A32::UserConfig conf)
+ : jit_interface(jit_interface)
+ , conf(conf)
+ , current_address_space(conf)
+ , core(conf) {}
+
+ HaltReason Run() {
+ ASSERT(!jit_interface->is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason)));
+
+ jit_interface->is_executing = true;
+ SCOPE_EXIT {
+ jit_interface->is_executing = false;
+ };
+
+ HaltReason hr = core.Run(current_address_space, current_state, &halt_reason);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ HaltReason Step() {
+ ASSERT(!jit_interface->is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason)));
+
+ jit_interface->is_executing = true;
+ SCOPE_EXIT {
+ jit_interface->is_executing = false;
+ };
+
+ HaltReason hr = core.Step(current_address_space, current_state, &halt_reason);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ void ClearCache() {
+ std::unique_lock lock{invalidation_mutex};
+ invalidate_entire_cache = true;
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void InvalidateCacheRange(std::uint32_t start_address, std::size_t length) {
+ std::unique_lock lock{invalidation_mutex};
+ invalid_cache_ranges.add(boost::icl::discrete_interval<u32>::closed(start_address, static_cast<u32>(start_address + length - 1)));
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void Reset() {
+ current_state = {};
+ }
+
+ void HaltExecution(HaltReason hr) {
+ Atomic::Or(&halt_reason, static_cast<u32>(hr));
+ Atomic::Barrier();
+ }
+
+ void ClearHalt(HaltReason hr) {
+ Atomic::And(&halt_reason, ~static_cast<u32>(hr));
+ Atomic::Barrier();
+ }
+
+ std::array<std::uint32_t, 16>& Regs() {
+ return current_state.regs;
+ }
+
+ const std::array<std::uint32_t, 16>& Regs() const {
+ return current_state.regs;
+ }
+
+ std::array<std::uint32_t, 64>& ExtRegs() {
+ return current_state.ext_regs;
+ }
+
+ const std::array<std::uint32_t, 64>& ExtRegs() const {
+ return current_state.ext_regs;
+ }
+
+ std::uint32_t Cpsr() const {
+ return current_state.Cpsr();
+ }
+
+ void SetCpsr(std::uint32_t value) {
+ current_state.SetCpsr(value);
+ }
+
+ std::uint32_t Fpscr() const {
+ return current_state.Fpscr();
+ }
+
+ void SetFpscr(std::uint32_t value) {
+ current_state.SetFpscr(value);
+ }
+
+ void ClearExclusiveState() {
+ current_state.exclusive_state = false;
+ }
+
+ void DumpDisassembly() const {
+ ASSERT_FALSE("Unimplemented");
+ }
+
+private:
+ void PerformRequestedCacheInvalidation(HaltReason hr) {
+ if (Has(hr, HaltReason::CacheInvalidation)) {
+ std::unique_lock lock{invalidation_mutex};
+
+ ClearHalt(HaltReason::CacheInvalidation);
+
+ if (invalidate_entire_cache) {
+ current_address_space.ClearCache();
+
+ invalidate_entire_cache = false;
+ invalid_cache_ranges.clear();
+ return;
+ }
+
+ if (!invalid_cache_ranges.empty()) {
+ current_address_space.InvalidateCacheRanges(invalid_cache_ranges);
+
+ invalid_cache_ranges.clear();
+ return;
+ }
+ }
+ }
+
+ Jit* jit_interface;
+ A32::UserConfig conf;
+ A32JitState current_state{};
+ A32AddressSpace current_address_space;
+ A32Core core;
+
+ volatile u32 halt_reason = 0;
+
+ std::mutex invalidation_mutex;
+ boost::icl::interval_set<u32> invalid_cache_ranges;
+ bool invalidate_entire_cache = false;
+};
+
+Jit::Jit(UserConfig conf)
+ : impl(std::make_unique<Impl>(this, conf)) {}
+
+Jit::~Jit() = default;
+
+HaltReason Jit::Run() {
+ return impl->Run();
+}
+
+HaltReason Jit::Step() {
+ return impl->Step();
+}
+
+void Jit::ClearCache() {
+ impl->ClearCache();
+}
+
+void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) {
+ impl->InvalidateCacheRange(start_address, length);
+}
+
+void Jit::Reset() {
+ impl->Reset();
+}
+
+void Jit::HaltExecution(HaltReason hr) {
+ impl->HaltExecution(hr);
+}
+
+void Jit::ClearHalt(HaltReason hr) {
+ impl->ClearHalt(hr);
+}
+
+std::array<std::uint32_t, 16>& Jit::Regs() {
+ return impl->Regs();
+}
+
+const std::array<std::uint32_t, 16>& Jit::Regs() const {
+ return impl->Regs();
+}
+
+std::array<std::uint32_t, 64>& Jit::ExtRegs() {
+ return impl->ExtRegs();
+}
+
+const std::array<std::uint32_t, 64>& Jit::ExtRegs() const {
+ return impl->ExtRegs();
+}
+
+std::uint32_t Jit::Cpsr() const {
+ return impl->Cpsr();
+}
+
+void Jit::SetCpsr(std::uint32_t value) {
+ impl->SetCpsr(value);
+}
+
+std::uint32_t Jit::Fpscr() const {
+ return impl->Fpscr();
+}
+
+void Jit::SetFpscr(std::uint32_t value) {
+ impl->SetFpscr(value);
+}
+
+void Jit::ClearExclusiveState() {
+ impl->ClearExclusiveState();
+}
+
+void Jit::DumpDisassembly() const {
+ impl->DumpDisassembly();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp
new file mode 100644
index 0000000000..e24654c7db
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp
@@ -0,0 +1,74 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::Arm64 {
+
+u32 A32JitState::Cpsr() const {
+ u32 cpsr = 0;
+
+ // NZCV flags
+ cpsr |= cpsr_nzcv;
+ // Q flag
+ cpsr |= cpsr_q;
+ // GE flags
+ cpsr |= mcl::bit::get_bit<31>(cpsr_ge) ? 1 << 19 : 0;
+ cpsr |= mcl::bit::get_bit<23>(cpsr_ge) ? 1 << 18 : 0;
+ cpsr |= mcl::bit::get_bit<15>(cpsr_ge) ? 1 << 17 : 0;
+ cpsr |= mcl::bit::get_bit<7>(cpsr_ge) ? 1 << 16 : 0;
+ // E flag, T flag
+ cpsr |= mcl::bit::get_bit<1>(upper_location_descriptor) ? 1 << 9 : 0;
+ cpsr |= mcl::bit::get_bit<0>(upper_location_descriptor) ? 1 << 5 : 0;
+ // IT state
+ cpsr |= static_cast<u32>(upper_location_descriptor & 0b11111100'00000000);
+ cpsr |= static_cast<u32>(upper_location_descriptor & 0b00000011'00000000) << 17;
+ // Other flags
+ cpsr |= cpsr_jaifm;
+
+ return cpsr;
+}
+
+void A32JitState::SetCpsr(u32 cpsr) {
+ // NZCV flags
+ cpsr_nzcv = cpsr & 0xF0000000;
+ // Q flag
+ cpsr_q = cpsr & (1 << 27);
+ // GE flags
+ cpsr_ge = 0;
+ cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0;
+ cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0;
+ cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0;
+ cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 0x000000FF : 0;
+
+ upper_location_descriptor &= 0xFFFF0000;
+ // E flag, T flag
+ upper_location_descriptor |= mcl::bit::get_bit<9>(cpsr) ? 2 : 0;
+ upper_location_descriptor |= mcl::bit::get_bit<5>(cpsr) ? 1 : 0;
+ // IT state
+ upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000;
+ upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000;
+
+ // Other flags
+ cpsr_jaifm = cpsr & 0x010001DF;
+}
+
+constexpr u32 FPCR_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
+constexpr u32 FPSR_MASK = 0x0800'009f;
+
+u32 A32JitState::Fpscr() const {
+ return (upper_location_descriptor & 0xffff'0000) | fpsr | fpsr_nzcv;
+}
+
+void A32JitState::SetFpscr(u32 fpscr) {
+ fpsr_nzcv = fpscr & 0xf000'0000;
+ fpsr = fpscr & FPSR_MASK;
+ upper_location_descriptor = (upper_location_descriptor & 0x0000'ffff) | (fpscr & FPCR_MASK);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h
new file mode 100644
index 0000000000..978bf84ad2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h
@@ -0,0 +1,45 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct A32JitState {
+ u32 cpsr_nzcv = 0;
+ u32 cpsr_q = 0;
+ u32 cpsr_jaifm = 0;
+ u32 cpsr_ge = 0;
+
+ u32 fpsr = 0;
+ u32 fpsr_nzcv = 0;
+
+ std::array<u32, 16> regs{};
+
+ u32 upper_location_descriptor;
+
+ alignas(16) std::array<u32, 64> ext_regs{};
+
+ u32 exclusive_state = 0;
+
+ u32 Cpsr() const;
+ void SetCpsr(u32 cpsr);
+
+ u32 Fpscr() const;
+ void SetFpscr(u32 fpscr);
+
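+    // The IR location descriptor packs the current PC (regs[15]) into the low 32 bits and
+    // upper_location_descriptor (T/E bits, IT state and FPSCR mode) into the high 32 bits.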
+ IR::LocationDescriptor GetLocationDescriptor() const {
+ return IR::LocationDescriptor{regs[15] | (static_cast<u64>(upper_location_descriptor) << 32)};
+ }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp
new file mode 100644
index 0000000000..f2e20c3d69
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp
@@ -0,0 +1,600 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/a64_address_space.h"
+
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/devirtualize.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/stack_layout.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/translate/a64_translate.h"
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
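+// Emits a small thunk for a user-callback member function: the devirtualized `this` pointer is
+// loaded into X0 and control tail-branches to the target, with both pointers stored as 8-byte
+// literals immediately after the code.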
+template<auto mfp, typename T>
+static void* EmitCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
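+// "Wrapped" read variant: preserves every caller-saved register except Xscratch0, which carries
+// the virtual address in and the loaded value out, so the call clobbers nothing else.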
+template<auto mfp, typename T>
+static void* EmitWrappedReadCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Xscratch0);
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.MOV(Xscratch0, X0);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
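+// Exclusive read: routes the access through the global exclusive monitor (ReadAndMark), using
+// the user read callback to perform the underlying memory access.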
+template<auto callback, typename T>
+static void* EmitExclusiveReadCallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> T {
+ return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T {
+ return (conf.callbacks->*callback)(vaddr);
+ });
+ };
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
+template<auto mfp, typename T>
+static void* EmitWrappedWriteCallTrampoline(oaknut::CodeGenerator& code, T* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<mfp>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE;
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.MOV(X2, Xscratch1);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+template<auto callback, typename T>
+static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, T value) -> u32 {
+ return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr,
+ [&](T expected) -> bool {
+ return (conf.callbacks->*callback)(vaddr, value, expected);
+ })
+ ? 0
+ : 1;
+ };
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
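+// 128-bit reads return the value from the callback in X0/X1; the trampoline packs the two
+// halves into V0 before returning to the JIT.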
+static void* EmitRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), 0);
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.FMOV(D0, X0);
+ code.FMOV(V0.D()[1], X1);
+ ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+static void* EmitWrappedRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Q0);
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.FMOV(D0, X0);
+ code.FMOV(V0.D()[1], X1);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+static void* EmitExclusiveRead128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> Vector {
+ return conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector {
+ return conf.callbacks->MemoryRead128(vaddr);
+ });
+ };
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), 0);
+ code.LDR(X0, l_this);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.FMOV(D0, X0);
+ code.FMOV(V0.D()[1], X1);
+ ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
+static void* EmitWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.FMOV(X2, D0);
+ code.FMOV(X3, V0.D()[1]);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+static void* EmitWrappedWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) {
+ using namespace oaknut::util;
+
+ const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_);
+
+ oaknut::Label l_addr, l_this;
+
+ constexpr u64 save_regs = ABI_CALLER_SAVE;
+
+ void* target = code.xptr<void*>();
+ ABI_PushRegisters(code, save_regs, 0);
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xscratch0);
+ code.FMOV(X2, D0);
+ code.FMOV(X3, V0.D()[1]);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ ABI_PopRegisters(code, save_regs, 0);
+ code.RET();
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(info.this_ptr);
+ code.l(l_addr);
+ code.dx(info.fn_ptr);
+
+ return target;
+}
+
+static void* EmitExclusiveWrite128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) {
+ using namespace oaknut::util;
+
+ oaknut::Label l_addr, l_this;
+
+ auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, Vector value) -> u32 {
+ return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr,
+ [&](Vector expected) -> bool {
+ return conf.callbacks->MemoryWriteExclusive128(vaddr, value, expected);
+ })
+ ? 0
+ : 1;
+ };
+
+ void* target = code.xptr<void*>();
+ code.LDR(X0, l_this);
+ code.FMOV(X2, D0);
+ code.FMOV(X3, V0.D()[1]);
+ code.LDR(Xscratch0, l_addr);
+ code.BR(Xscratch0);
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(&conf));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+
+ return target;
+}
+
+A64AddressSpace::A64AddressSpace(const A64::UserConfig& conf)
+ : AddressSpace(conf.code_cache_size)
+ , conf(conf) {
+ EmitPrelude();
+}
+
+IR::Block A64AddressSpace::GenerateIR(IR::LocationDescriptor descriptor) const {
+ const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); };
+ IR::Block ir_block = A64::Translate(A64::LocationDescriptor{descriptor}, get_code,
+ {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct});
+
+ Optimization::A64CallbackConfigPass(ir_block, conf);
+ Optimization::NamingPass(ir_block);
+ if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) {
+ Optimization::A64GetSetElimination(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::ConstProp)) {
+ Optimization::ConstantPropagation(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::MiscIROpt)) {
+ Optimization::A64MergeInterpretBlocksPass(ir_block, conf.callbacks);
+ }
+ Optimization::VerificationPass(ir_block);
+
+ return ir_block;
+}
+
+void A64AddressSpace::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
+ InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
+}
+
+void A64AddressSpace::EmitPrelude() {
+ using namespace oaknut::util;
+
+ UnprotectCodeMemory();
+
+ prelude_info.read_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead8>(code, conf.callbacks);
+ prelude_info.read_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks);
+ prelude_info.read_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks);
+ prelude_info.read_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks);
+ prelude_info.read_memory_128 = EmitRead128CallTrampoline(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_8 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead8>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_16 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_32 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_64 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks);
+ prelude_info.wrapped_read_memory_128 = EmitWrappedRead128CallTrampoline(code, conf.callbacks);
+ prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead8, u8>(code, conf);
+ prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead16, u16>(code, conf);
+ prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead32, u32>(code, conf);
+ prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead64, u64>(code, conf);
+ prelude_info.exclusive_read_memory_128 = EmitExclusiveRead128CallTrampoline(code, conf);
+ prelude_info.write_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
+ prelude_info.write_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
+ prelude_info.write_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
+ prelude_info.write_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
+ prelude_info.write_memory_128 = EmitWrite128CallTrampoline(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_8 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_16 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_32 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_64 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks);
+ prelude_info.wrapped_write_memory_128 = EmitWrappedWrite128CallTrampoline(code, conf.callbacks);
+ prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf);
+ prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf);
+ prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf);
+ prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf);
+ prelude_info.exclusive_write_memory_128 = EmitExclusiveWrite128CallTrampoline(code, conf);
+ prelude_info.call_svc = EmitCallTrampoline<&A64::UserCallbacks::CallSVC>(code, conf.callbacks);
+ prelude_info.exception_raised = EmitCallTrampoline<&A64::UserCallbacks::ExceptionRaised>(code, conf.callbacks);
+ prelude_info.isb_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks);
+ prelude_info.ic_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionCacheOperationRaised>(code, conf.callbacks);
+ prelude_info.dc_raised = EmitCallTrampoline<&A64::UserCallbacks::DataCacheOperationRaised>(code, conf.callbacks);
+ prelude_info.get_cntpct = EmitCallTrampoline<&A64::UserCallbacks::GetCNTPCT>(code, conf.callbacks);
+ prelude_info.add_ticks = EmitCallTrampoline<&A64::UserCallbacks::AddTicks>(code, conf.callbacks);
+ prelude_info.get_ticks_remaining = EmitCallTrampoline<&A64::UserCallbacks::GetTicksRemaining>(code, conf.callbacks);
+
+ oaknut::Label return_from_run_code, l_return_to_dispatcher;
+
+ prelude_info.run_code = code.xptr<PreludeInfo::RunCodeFuncType>();
+ {
+ ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+
+ code.MOV(X19, X0);
+ code.MOV(Xstate, X1);
+ code.MOV(Xhalt, X2);
+ if (conf.page_table) {
+ code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+
+ if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ code.LDR(Xscratch0, l_return_to_dispatcher);
+ for (size_t i = 0; i < RSBCount; i++) {
+ code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry));
+ }
+ }
+
+ if (conf.enable_cycle_counting) {
+ code.BL(prelude_info.get_ticks_remaining);
+ code.MOV(Xticks, X0);
+ code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
+ }
+
+ code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
+ code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
+ code.LDR(Wscratch0, Xstate, offsetof(A64JitState, fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+
+ code.BR(X19);
+ }
+
+ prelude_info.step_code = code.xptr<PreludeInfo::RunCodeFuncType>();
+ {
+ ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+
+ code.MOV(X19, X0);
+ code.MOV(Xstate, X1);
+ code.MOV(Xhalt, X2);
+ if (conf.page_table) {
+ code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+
+ if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ code.LDR(Xscratch0, l_return_to_dispatcher);
+ for (size_t i = 0; i < RSBCount; i++) {
+ code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry));
+ }
+ }
+
+ if (conf.enable_cycle_counting) {
+ code.MOV(Xticks, 1);
+ code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run));
+ }
+
+ code.MRS(Xscratch1, oaknut::SystemReg::FPCR);
+ code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr));
+ code.LDR(Wscratch0, Xstate, offsetof(A64JitState, fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ oaknut::Label step_hr_loop;
+ code.l(step_hr_loop);
+ code.LDAXR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+ code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step));
+ code.STLXR(Wscratch1, Wscratch0, Xhalt);
+ code.CBNZ(Wscratch1, step_hr_loop);
+
+ code.BR(X19);
+ }
+
+ prelude_info.return_to_dispatcher = code.xptr<void*>();
+ {
+ oaknut::Label l_this, l_addr;
+
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, return_from_run_code);
+
+ if (conf.enable_cycle_counting) {
+ code.CMP(Xticks, 0);
+ code.B(LE, return_from_run_code);
+ }
+
+ code.LDR(X0, l_this);
+ code.MOV(X1, Xstate);
+ code.LDR(Xscratch0, l_addr);
+ code.BLR(Xscratch0);
+ code.BR(X0);
+
+ const auto fn = [](A64AddressSpace& self, A64JitState& context) -> CodePtr {
+ return self.GetOrEmit(context.GetLocationDescriptor());
+ };
+
+ code.align(8);
+ code.l(l_this);
+ code.dx(mcl::bit_cast<u64>(this));
+ code.l(l_addr);
+ code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn)));
+ }
+
+ prelude_info.return_from_run_code = code.xptr<void*>();
+ {
+ code.l(return_from_run_code);
+
+ if (conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ code.BL(prelude_info.add_ticks);
+ }
+
+ code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ oaknut::Label exit_hr_loop;
+ code.l(exit_hr_loop);
+ code.LDAXR(W0, Xhalt);
+ code.STLXR(Wscratch0, WZR, Xhalt);
+ code.CBNZ(Wscratch0, exit_hr_loop);
+
+ ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout));
+ code.RET();
+ }
+
+ code.align(8);
+ code.l(l_return_to_dispatcher);
+ code.dx(mcl::bit_cast<u64>(prelude_info.return_to_dispatcher));
+
+ prelude_info.end_of_prelude = code.offset();
+
+ mem.invalidate_all();
+ ProtectCodeMemory();
+}
+
+EmitConfig A64AddressSpace::GetEmitConfig() {
+ return EmitConfig{
+ .optimizations = conf.unsafe_optimizations ? conf.optimizations : conf.optimizations & all_safe_optimizations,
+
+ .hook_isb = conf.hook_isb,
+
+ .cntfreq_el0 = conf.cntfrq_el0,
+ .ctr_el0 = conf.ctr_el0,
+ .dczid_el0 = conf.dczid_el0,
+ .tpidrro_el0 = conf.tpidrro_el0,
+ .tpidr_el0 = conf.tpidr_el0,
+
+ .check_halt_on_memory_access = conf.check_halt_on_memory_access,
+
+ .page_table_pointer = mcl::bit_cast<u64>(conf.page_table),
+ .page_table_address_space_bits = conf.page_table_address_space_bits,
+ .page_table_pointer_mask_bits = conf.page_table_pointer_mask_bits,
+ .silently_mirror_page_table = conf.silently_mirror_page_table,
+ .absolute_offset_page_table = conf.absolute_offset_page_table,
+ .detect_misaligned_access_via_page_table = conf.detect_misaligned_access_via_page_table,
+ .only_detect_misalignment_via_page_table_on_page_boundary = conf.only_detect_misalignment_via_page_table_on_page_boundary,
+
+ .fastmem_pointer = mcl::bit_cast<u64>(conf.fastmem_pointer),
+ .recompile_on_fastmem_failure = conf.recompile_on_fastmem_failure,
+ .fastmem_address_space_bits = conf.fastmem_address_space_bits,
+ .silently_mirror_fastmem = conf.silently_mirror_fastmem,
+
+ .wall_clock_cntpct = conf.wall_clock_cntpct,
+ .enable_cycle_counting = conf.enable_cycle_counting,
+
+ .always_little_endian = true,
+
+ .descriptor_to_fpcr = [](const IR::LocationDescriptor& location) { return A64::LocationDescriptor{location}.FPCR(); },
+ .emit_cond = EmitA64Cond,
+ .emit_condition_failed_terminal = EmitA64ConditionFailedTerminal,
+ .emit_terminal = EmitA64Terminal,
+ .emit_check_memory_abort = EmitA64CheckMemoryAbort,
+
+ .state_nzcv_offset = offsetof(A64JitState, cpsr_nzcv),
+ .state_fpsr_offset = offsetof(A64JitState, fpsr),
+ .state_exclusive_state_offset = offsetof(A64JitState, exclusive_state),
+
+ .coprocessors{},
+
+ .very_verbose_debugging_output = conf.very_verbose_debugging_output,
+ };
+}
+
+void A64AddressSpace::RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo&) {
+ const A64::LocationDescriptor descriptor{block.Location()};
+ const A64::LocationDescriptor end_location{block.EndLocation()};
+ const auto range = boost::icl::discrete_interval<u64>::closed(descriptor.PC(), end_location.PC() - 1);
+ block_ranges.AddRange(range, descriptor);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h
new file mode 100644
index 0000000000..86eae4dd39
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h
@@ -0,0 +1,35 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/backend/arm64/address_space.h"
+#include "dynarmic/backend/block_range_information.h"
+#include "dynarmic/interface/A64/config.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmittedBlockInfo;
+
+class A64AddressSpace final : public AddressSpace {
+public:
+ explicit A64AddressSpace(const A64::UserConfig& conf);
+
+ IR::Block GenerateIR(IR::LocationDescriptor) const override;
+
+ void InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges);
+
+protected:
+ friend class A64Core;
+
+ void EmitPrelude();
+ EmitConfig GetEmitConfig() override;
+ void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) override;
+
+ const A64::UserConfig conf;
+ BlockRangeInformation<u64> block_ranges;
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h
new file mode 100644
index 0000000000..24fbb66b38
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h
@@ -0,0 +1,30 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/backend/arm64/a64_address_space.h"
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+class A64Core final {
+public:
+ explicit A64Core(const A64::UserConfig&) {}
+
+ HaltReason Run(A64AddressSpace& process, A64JitState& thread_ctx, volatile u32* halt_reason) {
+ const auto location_descriptor = thread_ctx.GetLocationDescriptor();
+ const auto entry_point = process.GetOrEmit(location_descriptor);
+ return process.prelude_info.run_code(entry_point, &thread_ctx, halt_reason);
+ }
+
+ HaltReason Step(A64AddressSpace& process, A64JitState& thread_ctx, volatile u32* halt_reason) {
+ const auto location_descriptor = A64::LocationDescriptor{thread_ctx.GetLocationDescriptor()}.SetSingleStepping(true);
+ const auto entry_point = process.GetOrEmit(location_descriptor);
+ return process.prelude_info.step_code(entry_point, &thread_ctx, halt_reason);
+ }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp
new file mode 100644
index 0000000000..b80e8b39dc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp
@@ -0,0 +1,323 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <memory>
+#include <mutex>
+
+#include <boost/icl/interval_set.hpp>
+#include <mcl/assert.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/arm64/a64_address_space.h"
+#include "dynarmic/backend/arm64/a64_core.h"
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/common/atomic.h"
+#include "dynarmic/interface/A64/a64.h"
+#include "dynarmic/interface/A64/config.h"
+
+namespace Dynarmic::A64 {
+
+using namespace Backend::Arm64;
+
+struct Jit::Impl final {
+ Impl(Jit*, A64::UserConfig conf)
+ : conf(conf)
+ , current_address_space(conf)
+ , core(conf) {}
+
+ HaltReason Run() {
+ ASSERT(!is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason)));
+
+ is_executing = true;
+ SCOPE_EXIT {
+ is_executing = false;
+ };
+
+ HaltReason hr = core.Run(current_address_space, current_state, &halt_reason);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ HaltReason Step() {
+ ASSERT(!is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason)));
+
+ is_executing = true;
+ SCOPE_EXIT {
+ is_executing = false;
+ };
+
+ HaltReason hr = core.Step(current_address_space, current_state, &halt_reason);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ void ClearCache() {
+ std::unique_lock lock{invalidation_mutex};
+ invalidate_entire_cache = true;
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void InvalidateCacheRange(std::uint64_t start_address, std::size_t length) {
+ std::unique_lock lock{invalidation_mutex};
+ invalid_cache_ranges.add(boost::icl::discrete_interval<u64>::closed(start_address, start_address + length - 1));
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void Reset() {
+ current_state = {};
+ }
+
+ void HaltExecution(HaltReason hr) {
+ Atomic::Or(&halt_reason, static_cast<u32>(hr));
+ }
+
+ void ClearHalt(HaltReason hr) {
+ Atomic::And(&halt_reason, ~static_cast<u32>(hr));
+ }
+
+ std::uint64_t PC() const {
+ return current_state.pc;
+ }
+
+ void SetPC(std::uint64_t value) {
+ current_state.pc = value;
+ }
+
+ std::uint64_t SP() const {
+ return current_state.sp;
+ }
+
+ void SetSP(std::uint64_t value) {
+ current_state.sp = value;
+ }
+
+ std::array<std::uint64_t, 31>& Regs() {
+ return current_state.reg;
+ }
+
+ const std::array<std::uint64_t, 31>& Regs() const {
+ return current_state.reg;
+ }
+
+ std::array<std::uint64_t, 64>& VecRegs() {
+ return current_state.vec;
+ }
+
+ const std::array<std::uint64_t, 64>& VecRegs() const {
+ return current_state.vec;
+ }
+
+ std::uint32_t Fpcr() const {
+ return current_state.fpcr;
+ }
+
+ void SetFpcr(std::uint32_t value) {
+ current_state.fpcr = value;
+ }
+
+ std::uint32_t Fpsr() const {
+ return current_state.fpsr;
+ }
+
+ void SetFpsr(std::uint32_t value) {
+ current_state.fpsr = value;
+ }
+
+ std::uint32_t Pstate() const {
+ return current_state.cpsr_nzcv;
+ }
+
+ void SetPstate(std::uint32_t value) {
+ current_state.cpsr_nzcv = value;
+ }
+
+ void ClearExclusiveState() {
+ current_state.exclusive_state = false;
+ }
+
+ bool IsExecuting() const {
+ return is_executing;
+ }
+
+ void DumpDisassembly() const {
+ ASSERT_FALSE("Unimplemented");
+ }
+
+ std::vector<std::string> Disassemble() const {
+ ASSERT_FALSE("Unimplemented");
+ }
+
+private:
+ void PerformRequestedCacheInvalidation(HaltReason hr) {
+ if (Has(hr, HaltReason::CacheInvalidation)) {
+ std::unique_lock lock{invalidation_mutex};
+
+ ClearHalt(HaltReason::CacheInvalidation);
+
+ if (invalidate_entire_cache) {
+ current_address_space.ClearCache();
+
+ invalidate_entire_cache = false;
+ invalid_cache_ranges.clear();
+ return;
+ }
+
+ if (!invalid_cache_ranges.empty()) {
+ current_address_space.InvalidateCacheRanges(invalid_cache_ranges);
+
+ invalid_cache_ranges.clear();
+ return;
+ }
+ }
+ }
+
+ A64::UserConfig conf;
+ A64JitState current_state{};
+ A64AddressSpace current_address_space;
+ A64Core core;
+
+ volatile u32 halt_reason = 0;
+
+ std::mutex invalidation_mutex;
+ boost::icl::interval_set<u64> invalid_cache_ranges;
+ bool invalidate_entire_cache = false;
+ bool is_executing = false;
+};
+
+Jit::Jit(UserConfig conf)
+ : impl{std::make_unique<Jit::Impl>(this, conf)} {
+}
+
+Jit::~Jit() = default;
+
+HaltReason Jit::Run() {
+ return impl->Run();
+}
+
+HaltReason Jit::Step() {
+ return impl->Step();
+}
+
+void Jit::ClearCache() {
+ impl->ClearCache();
+}
+
+void Jit::InvalidateCacheRange(std::uint64_t start_address, std::size_t length) {
+ impl->InvalidateCacheRange(start_address, length);
+}
+
+void Jit::Reset() {
+ impl->Reset();
+}
+
+void Jit::HaltExecution(HaltReason hr) {
+ impl->HaltExecution(hr);
+}
+
+void Jit::ClearHalt(HaltReason hr) {
+ impl->ClearHalt(hr);
+}
+
+std::uint64_t Jit::GetSP() const {
+ return impl->SP();
+}
+
+void Jit::SetSP(std::uint64_t value) {
+ impl->SetSP(value);
+}
+
+std::uint64_t Jit::GetPC() const {
+ return impl->PC();
+}
+
+void Jit::SetPC(std::uint64_t value) {
+ impl->SetPC(value);
+}
+
+std::uint64_t Jit::GetRegister(std::size_t index) const {
+ return impl->Regs()[index];
+}
+
+void Jit::SetRegister(size_t index, std::uint64_t value) {
+ impl->Regs()[index] = value;
+}
+
+std::array<std::uint64_t, 31> Jit::GetRegisters() const {
+ return impl->Regs();
+}
+
+void Jit::SetRegisters(const std::array<std::uint64_t, 31>& value) {
+ impl->Regs() = value;
+}
+
+Vector Jit::GetVector(std::size_t index) const {
+ auto& vec = impl->VecRegs();
+ return {vec[index * 2], vec[index * 2 + 1]};
+}
+
+void Jit::SetVector(std::size_t index, Vector value) {
+ auto& vec = impl->VecRegs();
+ vec[index * 2] = value[0];
+ vec[index * 2 + 1] = value[1];
+}
+
+std::array<Vector, 32> Jit::GetVectors() const {
+ std::array<Vector, 32> ret;
+ std::memcpy(ret.data(), impl->VecRegs().data(), sizeof(ret));
+ return ret;
+}
+
+void Jit::SetVectors(const std::array<Vector, 32>& value) {
+ std::memcpy(impl->VecRegs().data(), value.data(), sizeof(value));
+}
+
+std::uint32_t Jit::GetFpcr() const {
+ return impl->Fpcr();
+}
+
+void Jit::SetFpcr(std::uint32_t value) {
+ impl->SetFpcr(value);
+}
+
+std::uint32_t Jit::GetFpsr() const {
+ return impl->Fpsr();
+}
+
+void Jit::SetFpsr(std::uint32_t value) {
+ impl->SetFpsr(value);
+}
+
+std::uint32_t Jit::GetPstate() const {
+ return impl->Pstate();
+}
+
+void Jit::SetPstate(std::uint32_t value) {
+ impl->SetPstate(value);
+}
+
+void Jit::ClearExclusiveState() {
+ impl->ClearExclusiveState();
+}
+
+bool Jit::IsExecuting() const {
+ return impl->IsExecuting();
+}
+
+void Jit::DumpDisassembly() const {
+ impl->DumpDisassembly();
+}
+
+std::vector<std::string> Jit::Disassemble() const {
+ return impl->Disassemble();
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h
new file mode 100644
index 0000000000..215e6987f3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h
@@ -0,0 +1,37 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct A64JitState {
+ std::array<u64, 31> reg{};
+ u64 sp = 0;
+ u64 pc = 0;
+
+ u32 cpsr_nzcv = 0;
+
+ alignas(16) std::array<u64, 64> vec{};
+
+ u32 exclusive_state = 0;
+
+ u32 fpsr = 0;
+ u32 fpcr = 0;
+
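+    // The IR location descriptor combines the PC with the FPCR mode bits relevant to codegen.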
+ IR::LocationDescriptor GetLocationDescriptor() const {
+ const u64 fpcr_u64 = static_cast<u64>(fpcr & A64::LocationDescriptor::fpcr_mask) << A64::LocationDescriptor::fpcr_shift;
+ const u64 pc_u64 = pc & A64::LocationDescriptor::pc_mask;
+ return IR::LocationDescriptor{pc_u64 | fpcr_u64};
+ }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp
new file mode 100644
index 0000000000..6d7b96379b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp
@@ -0,0 +1,91 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/abi.h"
+
+#include <vector>
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+#include <oaknut/oaknut.hpp>
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+static constexpr size_t gpr_size = 8;
+static constexpr size_t fpr_size = 16;
+
+struct FrameInfo {
+ std::vector<int> gprs;
+ std::vector<int> fprs;
+ size_t frame_size;
+ size_t gprs_size;
+ size_t fprs_size;
+};
+
+static std::vector<int> ListToIndexes(u32 list) {
+ std::vector<int> indexes;
+ for (int i = 0; i < 32; i++) {
+ if (mcl::bit::get_bit(i, list)) {
+ indexes.push_back(i);
+ }
+ }
+ return indexes;
+}
+
+static FrameInfo CalculateFrameInfo(RegisterList rl, size_t frame_size) {
+ const auto gprs = ListToIndexes(static_cast<u32>(rl));
+ const auto fprs = ListToIndexes(static_cast<u32>(rl >> 32));
+
+ const size_t num_gprs = gprs.size();
+ const size_t num_fprs = fprs.size();
+
+ const size_t gprs_size = (num_gprs + 1) / 2 * 16;
+ const size_t fprs_size = num_fprs * 16;
+
+ return {
+ gprs,
+ fprs,
+ frame_size,
+ gprs_size,
+ fprs_size,
+ };
+}
+
+#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) \
+ if (frame_info.TYPE##s.size() > 0) { \
+ for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \
+ code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \
+ } \
+ if (frame_info.TYPE##s.size() % 2 == 1) { \
+ const size_t i = frame_info.TYPE##s.size() - 1; \
+ code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \
+ } \
+ }
+
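+// Saved registers are laid out above the requested frame: GPRs first, packed pairwise at an
+// 8-byte stride (rounded up to a 16-byte multiple), then FPRs as full 16-byte Q registers;
+// frame_size bytes are reserved below them for the StackLayout.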
+void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) {
+ const FrameInfo frame_info = CalculateFrameInfo(rl, frame_size);
+
+ code.SUB(SP, SP, frame_info.gprs_size + frame_info.fprs_size);
+
+ DO_IT(gpr, XReg, STP, STR, 0)
+ DO_IT(fpr, QReg, STP, STR, frame_info.gprs_size)
+
+ code.SUB(SP, SP, frame_info.frame_size);
+}
+
+void ABI_PopRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) {
+ const FrameInfo frame_info = CalculateFrameInfo(rl, frame_size);
+
+ code.ADD(SP, SP, frame_info.frame_size);
+
+ DO_IT(gpr, XReg, LDP, LDR, 0)
+ DO_IT(fpr, QReg, LDP, LDR, frame_info.gprs_size)
+
+ code.ADD(SP, SP, frame_info.gprs_size + frame_info.fprs_size);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/abi.h b/externals/dynarmic/src/dynarmic/backend/arm64/abi.h
new file mode 100644
index 0000000000..609b06cd22
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/abi.h
@@ -0,0 +1,78 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <initializer_list>
+#include <stdexcept>
+#include <type_traits>
+
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <mcl/stdint.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/common/always_false.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+constexpr oaknut::XReg Xstate{28};
+constexpr oaknut::XReg Xhalt{27};
+constexpr oaknut::XReg Xticks{26};
+constexpr oaknut::XReg Xfastmem{25};
+constexpr oaknut::XReg Xpagetable{24};
+
+constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17}, Xscratch2{30};
+constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17}, Wscratch2{30};
+
+template<size_t bitsize>
+constexpr auto Rscratch0() {
+ if constexpr (bitsize == 32) {
+ return Wscratch0;
+ } else if constexpr (bitsize == 64) {
+ return Xscratch0;
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<bitsize>>);
+ }
+}
+
+template<size_t bitsize>
+constexpr auto Rscratch1() {
+ if constexpr (bitsize == 32) {
+ return Wscratch1;
+ } else if constexpr (bitsize == 64) {
+ return Xscratch1;
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<bitsize>>);
+ }
+}
+
+constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8};
+constexpr std::initializer_list<int> FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+using RegisterList = u64;
+
+constexpr RegisterList ToRegList(oaknut::Reg reg) {
+ if (reg.is_vector()) {
+ return RegisterList{1} << (reg.index() + 32);
+ }
+
+ if (reg.index() == 31) {
+ throw std::out_of_range("ZR not allowed in reg list");
+ }
+
+ if (reg.index() == -1) {
+ return RegisterList{1} << 31;
+ }
+
+ return RegisterList{1} << reg.index();
+}
+
+constexpr RegisterList ABI_CALLEE_SAVE = 0x0000ff00'7ff80000;
+constexpr RegisterList ABI_CALLER_SAVE = 0xffffffff'4000ffff;
+
+void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t stack_space);
+void ABI_PopRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t stack_space);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp
new file mode 100644
index 0000000000..10df477d19
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp
@@ -0,0 +1,342 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/a64_address_space.h"
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/devirtualize.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/stack_layout.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+AddressSpace::AddressSpace(size_t code_cache_size)
+ : code_cache_size(code_cache_size)
+ , mem(code_cache_size)
+ , code(mem.ptr(), mem.ptr())
+ , fastmem_manager(exception_handler) {
+ ASSERT_MSG(code_cache_size <= 128 * 1024 * 1024, "code_cache_size > 128 MiB not currently supported");
+
+ exception_handler.Register(mem, code_cache_size);
+ exception_handler.SetFastmemCallback([this](u64 host_pc) {
+ return FastmemCallback(host_pc);
+ });
+}
+
+AddressSpace::~AddressSpace() = default;
+
+CodePtr AddressSpace::Get(IR::LocationDescriptor descriptor) {
+ if (const auto iter = block_entries.find(descriptor); iter != block_entries.end()) {
+ return iter->second;
+ }
+ return nullptr;
+}
+
+std::optional<IR::LocationDescriptor> AddressSpace::ReverseGetLocation(CodePtr host_pc) {
+ if (auto iter = reverse_block_entries.upper_bound(host_pc); iter != reverse_block_entries.begin()) {
+        // upper_bound locates the first entry with a key greater than host_pc, so we need to decrement
+ --iter;
+ return iter->second;
+ }
+ return std::nullopt;
+}
+
+CodePtr AddressSpace::ReverseGetEntryPoint(CodePtr host_pc) {
+ if (auto iter = reverse_block_entries.upper_bound(host_pc); iter != reverse_block_entries.begin()) {
+        // upper_bound locates the first entry with a key greater than host_pc, so we need to decrement
+ --iter;
+ return iter->first;
+ }
+ return nullptr;
+}
+
+CodePtr AddressSpace::GetOrEmit(IR::LocationDescriptor descriptor) {
+ if (CodePtr block_entry = Get(descriptor)) {
+ return block_entry;
+ }
+
+ IR::Block ir_block = GenerateIR(descriptor);
+ const EmittedBlockInfo block_info = Emit(std::move(ir_block));
+ return block_info.entry_point;
+}
+
+void AddressSpace::InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& descriptors) {
+ UnprotectCodeMemory();
+
+ for (const auto& descriptor : descriptors) {
+ const auto iter = block_entries.find(descriptor);
+ if (iter == block_entries.end()) {
+ continue;
+ }
+
+ // Unlink before removal because InvalidateBasicBlocks can be called within a fastmem callback,
+ // and the currently executing block may have references to itself which need to be unlinked.
+ RelinkForDescriptor(descriptor, nullptr);
+
+ block_entries.erase(iter);
+ }
+
+ ProtectCodeMemory();
+}
+
+void AddressSpace::ClearCache() {
+ block_entries.clear();
+ reverse_block_entries.clear();
+ block_infos.clear();
+ block_references.clear();
+ code.set_offset(prelude_info.end_of_prelude);
+}
+
+size_t AddressSpace::GetRemainingSize() {
+ return code_cache_size - static_cast<size_t>(code.offset());
+}
+
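+// Compiles a single IR block into the code cache:
+//   1. If less than 1 MiB remains in the cache, the entire cache is flushed.
+//   2. The cache is made writable (on W^X platforms), the block is emitted, and its entry
+//      point is registered in the lookup tables.
+//   3. Outgoing relocations are patched (Link) and any already-compiled blocks that branch to
+//      this descriptor are re-linked to the new entry point.
+//   4. The instruction cache is invalidated for the new range and the cache is made
+//      executable again.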
+EmittedBlockInfo AddressSpace::Emit(IR::Block block) {
+ if (GetRemainingSize() < 1024 * 1024) {
+ ClearCache();
+ }
+
+ UnprotectCodeMemory();
+
+ EmittedBlockInfo block_info = EmitArm64(code, std::move(block), GetEmitConfig(), fastmem_manager);
+
+ ASSERT(block_entries.insert({block.Location(), block_info.entry_point}).second);
+ ASSERT(reverse_block_entries.insert({block_info.entry_point, block.Location()}).second);
+ ASSERT(block_infos.insert({block_info.entry_point, block_info}).second);
+
+ Link(block_info);
+ RelinkForDescriptor(block.Location(), block_info.entry_point);
+
+ mem.invalidate(reinterpret_cast<u32*>(block_info.entry_point), block_info.size);
+ ProtectCodeMemory();
+
+ RegisterNewBasicBlock(block, block_info);
+
+ return block_info;
+}
+
+void AddressSpace::Link(EmittedBlockInfo& block_info) {
+ using namespace oaknut;
+ using namespace oaknut::util;
+
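+    // Each relocation site was emitted as a placeholder (see EmitRelocation); overwrite it
+    // in place with a branch to the corresponding thunk recorded in prelude_info.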
+ for (auto [ptr_offset, target] : block_info.relocations) {
+ CodeGenerator c{mem.ptr(), mem.ptr()};
+ c.set_xptr(reinterpret_cast<u32*>(block_info.entry_point + ptr_offset));
+
+ switch (target) {
+ case LinkTarget::ReturnToDispatcher:
+ c.B(prelude_info.return_to_dispatcher);
+ break;
+ case LinkTarget::ReturnFromRunCode:
+ c.B(prelude_info.return_from_run_code);
+ break;
+ case LinkTarget::ReadMemory8:
+ c.BL(prelude_info.read_memory_8);
+ break;
+ case LinkTarget::ReadMemory16:
+ c.BL(prelude_info.read_memory_16);
+ break;
+ case LinkTarget::ReadMemory32:
+ c.BL(prelude_info.read_memory_32);
+ break;
+ case LinkTarget::ReadMemory64:
+ c.BL(prelude_info.read_memory_64);
+ break;
+ case LinkTarget::ReadMemory128:
+ c.BL(prelude_info.read_memory_128);
+ break;
+ case LinkTarget::WrappedReadMemory8:
+ c.BL(prelude_info.wrapped_read_memory_8);
+ break;
+ case LinkTarget::WrappedReadMemory16:
+ c.BL(prelude_info.wrapped_read_memory_16);
+ break;
+ case LinkTarget::WrappedReadMemory32:
+ c.BL(prelude_info.wrapped_read_memory_32);
+ break;
+ case LinkTarget::WrappedReadMemory64:
+ c.BL(prelude_info.wrapped_read_memory_64);
+ break;
+ case LinkTarget::WrappedReadMemory128:
+ c.BL(prelude_info.wrapped_read_memory_128);
+ break;
+ case LinkTarget::ExclusiveReadMemory8:
+ c.BL(prelude_info.exclusive_read_memory_8);
+ break;
+ case LinkTarget::ExclusiveReadMemory16:
+ c.BL(prelude_info.exclusive_read_memory_16);
+ break;
+ case LinkTarget::ExclusiveReadMemory32:
+ c.BL(prelude_info.exclusive_read_memory_32);
+ break;
+ case LinkTarget::ExclusiveReadMemory64:
+ c.BL(prelude_info.exclusive_read_memory_64);
+ break;
+ case LinkTarget::ExclusiveReadMemory128:
+ c.BL(prelude_info.exclusive_read_memory_128);
+ break;
+ case LinkTarget::WriteMemory8:
+ c.BL(prelude_info.write_memory_8);
+ break;
+ case LinkTarget::WriteMemory16:
+ c.BL(prelude_info.write_memory_16);
+ break;
+ case LinkTarget::WriteMemory32:
+ c.BL(prelude_info.write_memory_32);
+ break;
+ case LinkTarget::WriteMemory64:
+ c.BL(prelude_info.write_memory_64);
+ break;
+ case LinkTarget::WriteMemory128:
+ c.BL(prelude_info.write_memory_128);
+ break;
+ case LinkTarget::WrappedWriteMemory8:
+ c.BL(prelude_info.wrapped_write_memory_8);
+ break;
+ case LinkTarget::WrappedWriteMemory16:
+ c.BL(prelude_info.wrapped_write_memory_16);
+ break;
+ case LinkTarget::WrappedWriteMemory32:
+ c.BL(prelude_info.wrapped_write_memory_32);
+ break;
+ case LinkTarget::WrappedWriteMemory64:
+ c.BL(prelude_info.wrapped_write_memory_64);
+ break;
+ case LinkTarget::WrappedWriteMemory128:
+ c.BL(prelude_info.wrapped_write_memory_128);
+ break;
+ case LinkTarget::ExclusiveWriteMemory8:
+ c.BL(prelude_info.exclusive_write_memory_8);
+ break;
+ case LinkTarget::ExclusiveWriteMemory16:
+ c.BL(prelude_info.exclusive_write_memory_16);
+ break;
+ case LinkTarget::ExclusiveWriteMemory32:
+ c.BL(prelude_info.exclusive_write_memory_32);
+ break;
+ case LinkTarget::ExclusiveWriteMemory64:
+ c.BL(prelude_info.exclusive_write_memory_64);
+ break;
+ case LinkTarget::ExclusiveWriteMemory128:
+ c.BL(prelude_info.exclusive_write_memory_128);
+ break;
+ case LinkTarget::CallSVC:
+ c.BL(prelude_info.call_svc);
+ break;
+ case LinkTarget::ExceptionRaised:
+ c.BL(prelude_info.exception_raised);
+ break;
+ case LinkTarget::InstructionSynchronizationBarrierRaised:
+ c.BL(prelude_info.isb_raised);
+ break;
+ case LinkTarget::InstructionCacheOperationRaised:
+ c.BL(prelude_info.ic_raised);
+ break;
+ case LinkTarget::DataCacheOperationRaised:
+ c.BL(prelude_info.dc_raised);
+ break;
+ case LinkTarget::GetCNTPCT:
+ c.BL(prelude_info.get_cntpct);
+ break;
+ case LinkTarget::AddTicks:
+ c.BL(prelude_info.add_ticks);
+ break;
+ case LinkTarget::GetTicksRemaining:
+ c.BL(prelude_info.get_ticks_remaining);
+ break;
+ default:
+ ASSERT_FALSE("Invalid relocation target");
+ }
+ }
+
+ for (auto [target_descriptor, list] : block_info.block_relocations) {
+ block_references[target_descriptor].insert(block_info.entry_point);
+ LinkBlockLinks(block_info.entry_point, Get(target_descriptor), list);
+ }
+}
+
+void AddressSpace::LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector<BlockRelocation>& block_relocations_list) {
+ using namespace oaknut;
+ using namespace oaknut::util;
+
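+    // Patch direct block-to-block links. A null target_ptr means the target block is not
+    // currently compiled: Branch sites become NOPs (falling through to the slow path that
+    // follows them), and MoveToScratch1 sites fall back to the dispatcher address.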
+ for (auto [ptr_offset, type] : block_relocations_list) {
+ CodeGenerator c{mem.ptr(), mem.ptr()};
+ c.set_xptr(reinterpret_cast<u32*>(entry_point + ptr_offset));
+
+ switch (type) {
+ case BlockRelocationType::Branch:
+ if (target_ptr) {
+ c.B((void*)target_ptr);
+ } else {
+ c.NOP();
+ }
+ break;
+ case BlockRelocationType::MoveToScratch1:
+ if (target_ptr) {
+ c.ADRL(Xscratch1, (void*)target_ptr);
+ } else {
+ c.ADRL(Xscratch1, prelude_info.return_to_dispatcher);
+ }
+ break;
+ default:
+ ASSERT_FALSE("Invalid BlockRelocationType");
+ }
+ }
+}
+
+void AddressSpace::RelinkForDescriptor(IR::LocationDescriptor target_descriptor, CodePtr target_ptr) {
+ for (auto code_ptr : block_references[target_descriptor]) {
+ if (auto block_iter = block_infos.find(code_ptr); block_iter != block_infos.end()) {
+ const EmittedBlockInfo& block_info = block_iter->second;
+
+ if (auto relocation_iter = block_info.block_relocations.find(target_descriptor); relocation_iter != block_info.block_relocations.end()) {
+ LinkBlockLinks(block_info.entry_point, target_ptr, relocation_iter->second);
+ }
+
+ mem.invalidate(reinterpret_cast<u32*>(block_info.entry_point), block_info.size);
+ }
+ }
+}
+
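+// Called by the exception handler when a fault occurs inside JITted code. Locates the block
+// containing host_pc and the fastmem patch recorded at that offset, and returns the FakeCall
+// describing where the handler should redirect execution (presumably a fallback path that
+// performs the access via the memory callbacks). If the patch requests it, the location is
+// marked do-not-fastmem and the block is invalidated so it is recompiled without fastmem.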
+FakeCall AddressSpace::FastmemCallback(u64 host_pc) {
+ {
+ const auto host_ptr = mcl::bit_cast<CodePtr>(host_pc);
+
+ const auto entry_point = ReverseGetEntryPoint(host_ptr);
+ if (!entry_point) {
+ goto fail;
+ }
+
+ const auto block_info = block_infos.find(entry_point);
+ if (block_info == block_infos.end()) {
+ goto fail;
+ }
+
+ const auto patch_entry = block_info->second.fastmem_patch_info.find(host_ptr - entry_point);
+ if (patch_entry == block_info->second.fastmem_patch_info.end()) {
+ goto fail;
+ }
+
+ const auto fc = patch_entry->second.fc;
+
+ if (patch_entry->second.recompile) {
+ const auto marker = patch_entry->second.marker;
+ fastmem_manager.MarkDoNotFastmem(marker);
+ InvalidateBasicBlocks({std::get<0>(marker)});
+ }
+
+ return fc;
+ }
+
+fail:
+ fmt::print("dynarmic: Segfault happened within JITted code at host_pc = {:016x}\n", host_pc);
+ fmt::print("Segfault wasn't at a fastmem patch location!\n");
+ ASSERT_FALSE("segfault");
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h
new file mode 100644
index 0000000000..a5fe7513e2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h
@@ -0,0 +1,136 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <map>
+#include <optional>
+
+#include <mcl/stdint.hpp>
+#include <oaknut/code_block.hpp>
+#include <oaknut/oaknut.hpp>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
+
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/fastmem.h"
+#include "dynarmic/interface/halt_reason.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+class AddressSpace {
+public:
+ explicit AddressSpace(size_t code_cache_size);
+ virtual ~AddressSpace();
+
+ virtual IR::Block GenerateIR(IR::LocationDescriptor) const = 0;
+
+ CodePtr Get(IR::LocationDescriptor descriptor);
+
+    // Returns the "most likely" LocationDescriptor associated with the emitted code at that location
+ std::optional<IR::LocationDescriptor> ReverseGetLocation(CodePtr host_pc);
+
+ // Returns "most likely" entry_point associated with the emitted code at that location
+ CodePtr ReverseGetEntryPoint(CodePtr host_pc);
+
+ CodePtr GetOrEmit(IR::LocationDescriptor descriptor);
+
+ void InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& descriptors);
+
+ void ClearCache();
+
+protected:
+ virtual EmitConfig GetEmitConfig() = 0;
+ virtual void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) = 0;
+
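+    // On platforms that enforce W^X for JIT memory (e.g. Apple Silicon, OpenBSD), the code
+    // cache is toggled between writable and executable around emission and patching.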
+ void ProtectCodeMemory() {
+#if defined(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT) || defined(__APPLE__) || defined(__OpenBSD__)
+ mem.protect();
+#endif
+ }
+
+ void UnprotectCodeMemory() {
+#if defined(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT) || defined(__APPLE__) || defined(__OpenBSD__)
+ mem.unprotect();
+#endif
+ }
+
+ size_t GetRemainingSize();
+ EmittedBlockInfo Emit(IR::Block ir_block);
+ void Link(EmittedBlockInfo& block);
+ void LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector<BlockRelocation>& block_relocations_list);
+ void RelinkForDescriptor(IR::LocationDescriptor target_descriptor, CodePtr target_ptr);
+
+ FakeCall FastmemCallback(u64 host_pc);
+
+ const size_t code_cache_size;
+ oaknut::CodeBlock mem;
+ oaknut::CodeGenerator code;
+
+    // An IR::LocationDescriptor has exactly one current CodePtr.
+    // However, there may also be multiple older CodePtrs belonging to previously invalidated blocks.
+ tsl::robin_map<IR::LocationDescriptor, CodePtr> block_entries;
+ std::map<CodePtr, IR::LocationDescriptor> reverse_block_entries;
+ tsl::robin_map<CodePtr, EmittedBlockInfo> block_infos;
+ tsl::robin_map<IR::LocationDescriptor, tsl::robin_set<CodePtr>> block_references;
+
+ ExceptionHandler exception_handler;
+ FastmemManager fastmem_manager;
+
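+    // Pointers into the prelude: the run-code entry points and the helper thunks that
+    // relocations branch to (see Link). These are populated when the prelude is emitted.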
+ struct PreludeInfo {
+ std::ptrdiff_t end_of_prelude;
+
+ using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, void* jit_state, volatile u32* halt_reason);
+ RunCodeFuncType run_code;
+ RunCodeFuncType step_code;
+ void* return_to_dispatcher;
+ void* return_from_run_code;
+
+ void* read_memory_8;
+ void* read_memory_16;
+ void* read_memory_32;
+ void* read_memory_64;
+ void* read_memory_128;
+ void* wrapped_read_memory_8;
+ void* wrapped_read_memory_16;
+ void* wrapped_read_memory_32;
+ void* wrapped_read_memory_64;
+ void* wrapped_read_memory_128;
+ void* exclusive_read_memory_8;
+ void* exclusive_read_memory_16;
+ void* exclusive_read_memory_32;
+ void* exclusive_read_memory_64;
+ void* exclusive_read_memory_128;
+ void* write_memory_8;
+ void* write_memory_16;
+ void* write_memory_32;
+ void* write_memory_64;
+ void* write_memory_128;
+ void* wrapped_write_memory_8;
+ void* wrapped_write_memory_16;
+ void* wrapped_write_memory_32;
+ void* wrapped_write_memory_64;
+ void* wrapped_write_memory_128;
+ void* exclusive_write_memory_8;
+ void* exclusive_write_memory_16;
+ void* exclusive_write_memory_32;
+ void* exclusive_write_memory_64;
+ void* exclusive_write_memory_128;
+
+ void* call_svc;
+ void* exception_raised;
+ void* dc_raised;
+ void* ic_raised;
+ void* isb_raised;
+
+ void* get_cntpct;
+ void* add_ticks;
+ void* get_ticks_remaining;
+ } prelude_info;
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h b/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h
new file mode 100644
index 0000000000..c433fd0126
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h
@@ -0,0 +1,57 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit_cast.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/function_info.hpp>
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct DevirtualizedCall {
+ u64 fn_ptr;
+ u64 this_ptr;
+};
+
+// https://rants.vastheman.com/2021/09/21/msvc/
+template<auto mfp>
+DevirtualizedCall DevirtualizeWindows(mcl::class_type<decltype(mfp)>* this_) {
+ static_assert(sizeof(mfp) == 8);
+ return DevirtualizedCall{mcl::bit_cast<u64>(mfp), reinterpret_cast<u64>(this_)};
+}
+
+// https://github.com/ARM-software/abi-aa/blob/main/cppabi64/cppabi64.rst#representation-of-pointer-to-member-function
+template<auto mfp>
+DevirtualizedCall DevirtualizeDefault(mcl::class_type<decltype(mfp)>* this_) {
+ struct MemberFunctionPointer {
+        // Address of the non-virtual function, or the byte offset of its entry in the vtable.
+ u64 ptr;
+        // The LSB indicates whether the function is virtual; the remaining bits hold the this-pointer adjustment.
+ u64 adj;
+ } mfp_struct = mcl::bit_cast<MemberFunctionPointer>(mfp);
+
+ static_assert(sizeof(MemberFunctionPointer) == 16);
+ static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp));
+
+ u64 fn_ptr = mfp_struct.ptr;
+ u64 this_ptr = mcl::bit_cast<u64>(this_) + (mfp_struct.adj >> 1);
+ if (mfp_struct.adj & 1) {
+ u64 vtable = mcl::bit_cast_pointee<u64>(this_ptr);
+ fn_ptr = mcl::bit_cast_pointee<u64>(vtable + fn_ptr);
+ }
+ return DevirtualizedCall{fn_ptr, this_ptr};
+}
+
+template<auto mfp>
+DevirtualizedCall Devirtualize(mcl::class_type<decltype(mfp)>* this_) {
+#if defined(_WIN32) && defined(_MSC_VER)
+ return DevirtualizeWindows<mfp>(this_);
+#else
+ return DevirtualizeDefault<mfp>(this_);
+#endif
+}
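+
+// Illustrative use (hypothetical call site, not part of this file): given a member function
+// pointer such as &A32::UserCallbacks::MemoryRead32, Devirtualize resolves the concrete
+// function address and adjusted this pointer so the JIT can emit a plain indirect call:
+//
+//     const auto call = Devirtualize<&A32::UserCallbacks::MemoryRead32>(conf.callbacks);
+//     code.MOV(X0, call.this_ptr);      // this pointer in the first argument register
+//     code.MOV(Xscratch0, call.fn_ptr);
+//     code.BLR(Xscratch0);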
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp
new file mode 100644
index 0000000000..1ecb5424da
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp
@@ -0,0 +1,290 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/emit_arm64.h"
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/backend/arm64/verbose_debugging_output.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<>
+void EmitIR<IR::Opcode::Void>(oaknut::CodeGenerator&, EmitContext&, IR::Inst*) {}
+
+template<>
+void EmitIR<IR::Opcode::Identity>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::Breakpoint>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.BRK(0);
+}
+
+template<>
+void EmitIR<IR::Opcode::CallHostFunction>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.PrepareForCall(args[1], args[2], args[3]);
+ code.MOV(Xscratch0, args[0].GetImmediateU64());
+ code.BLR(Xscratch0);
+}
+
+template<>
+void EmitIR<IR::Opcode::PushRSB>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ if (!ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[0].IsImmediate());
+ const IR::LocationDescriptor target{args[0].GetImmediateU64()};
+
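+    // Push onto the return stack buffer: advance rsb_ptr (a byte offset into StackLayout::rsb,
+    // wrapped by RSBIndexMask), then store the {location descriptor hash, target code pointer}
+    // pair into the new slot. The code pointer is filled in by the block linker (MoveToScratch1).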
+ code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+ code.ADD(Wscratch2, Wscratch2, sizeof(RSBEntry));
+ code.AND(Wscratch2, Wscratch2, RSBIndexMask);
+ code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+ code.ADD(Xscratch2, SP, Xscratch2);
+
+ code.MOV(Xscratch0, target.Value());
+ EmitBlockLinkRelocation(code, ctx, target, BlockRelocationType::MoveToScratch1);
+ code.STP(Xscratch0, Xscratch1, Xscratch2, offsetof(StackLayout, rsb));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetCarryFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(ctx.reg_alloc.WasValueDefined(inst));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetOverflowFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(ctx.reg_alloc.WasValueDefined(inst));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetGEFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(ctx.reg_alloc.WasValueDefined(inst));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetNZCVFromOp>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (ctx.reg_alloc.WasValueDefined(inst)) {
+ return;
+ }
+
+ switch (args[0].GetType()) {
+ case IR::Type::U32: {
+ auto Wvalue = ctx.reg_alloc.ReadW(args[0]);
+ auto flags = ctx.reg_alloc.WriteFlags(inst);
+ RegAlloc::Realize(Wvalue, flags);
+
+ code.TST(*Wvalue, Wvalue);
+ break;
+ }
+ case IR::Type::U64: {
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ auto flags = ctx.reg_alloc.WriteFlags(inst);
+ RegAlloc::Realize(Xvalue, flags);
+
+ code.TST(*Xvalue, Xvalue);
+ break;
+ }
+ default:
+ ASSERT_FALSE("Invalid type for GetNZCVFromOp");
+ break;
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::GetNZFromOp>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (ctx.reg_alloc.WasValueDefined(inst)) {
+ return;
+ }
+
+ switch (args[0].GetType()) {
+ case IR::Type::U32: {
+ auto Wvalue = ctx.reg_alloc.ReadW(args[0]);
+ auto flags = ctx.reg_alloc.WriteFlags(inst);
+ RegAlloc::Realize(Wvalue, flags);
+
+ code.TST(*Wvalue, *Wvalue);
+ break;
+ }
+ case IR::Type::U64: {
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ auto flags = ctx.reg_alloc.WriteFlags(inst);
+ RegAlloc::Realize(Xvalue, flags);
+
+ code.TST(*Xvalue, *Xvalue);
+ break;
+ }
+ default:
+ ASSERT_FALSE("Invalid type for GetNZFromOp");
+ break;
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::GetUpperFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(ctx.reg_alloc.WasValueDefined(inst));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetLowerFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(ctx.reg_alloc.WasValueDefined(inst));
+}
+
+template<>
+void EmitIR<IR::Opcode::GetCFlagFromNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wc = ctx.reg_alloc.WriteW(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wc, Wnzcv);
+
+ code.AND(Wc, Wnzcv, 1 << 29);
+}
+
+template<>
+void EmitIR<IR::Opcode::NZCVFromPackedFlags>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext& ctx, size_t cycles_to_add) {
+ if (!ctx.conf.enable_cycle_counting) {
+ return;
+ }
+ if (cycles_to_add == 0) {
+ return;
+ }
+
+ if (oaknut::AddSubImm::is_valid(cycles_to_add)) {
+ code.SUB(Xticks, Xticks, cycles_to_add);
+ } else {
+ code.MOV(Xscratch1, cycles_to_add);
+ code.SUB(Xticks, Xticks, Xscratch1);
+ }
+}
+
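+// Translates one IR block into AArch64 machine code. The emitted block:
+//   - optionally checks the block condition (via the frontend's emit_cond) and, on failure,
+//     charges the condition-failed cycle count and emits the condition-failed terminal;
+//   - dispatches every IR instruction to its EmitIR<op> specialisation (opcodes.inc);
+//   - spills pending FPSR state, charges the block's cycle count, and emits the terminal;
+//   - finally emits any deferred slow paths collected during emission after a trapping BRK.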
+EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf, FastmemManager& fastmem_manager) {
+ if (conf.very_verbose_debugging_output) {
+ std::puts(IR::DumpBlock(block).c_str());
+ }
+
+ EmittedBlockInfo ebi;
+
+ FpsrManager fpsr_manager{code, conf.state_fpsr_offset};
+ RegAlloc reg_alloc{code, fpsr_manager, GPR_ORDER, FPR_ORDER};
+ EmitContext ctx{block, reg_alloc, conf, ebi, fpsr_manager, fastmem_manager, {}};
+
+ ebi.entry_point = code.xptr<CodePtr>();
+
+ if (ctx.block.GetCondition() == IR::Cond::AL) {
+ ASSERT(!ctx.block.HasConditionFailedLocation());
+ } else {
+ ASSERT(ctx.block.HasConditionFailedLocation());
+ oaknut::Label pass;
+
+ pass = conf.emit_cond(code, ctx, ctx.block.GetCondition());
+ EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount());
+ conf.emit_condition_failed_terminal(code, ctx);
+
+ code.l(pass);
+ }
+
+ for (auto iter = block.begin(); iter != block.end(); ++iter) {
+ IR::Inst* inst = &*iter;
+
+ switch (inst->GetOpcode()) {
+#define OPCODE(name, type, ...) \
+ case IR::Opcode::name: \
+ EmitIR<IR::Opcode::name>(code, ctx, inst); \
+ break;
+#define A32OPC(name, type, ...) \
+ case IR::Opcode::A32##name: \
+ EmitIR<IR::Opcode::A32##name>(code, ctx, inst); \
+ break;
+#define A64OPC(name, type, ...) \
+ case IR::Opcode::A64##name: \
+ EmitIR<IR::Opcode::A64##name>(code, ctx, inst); \
+ break;
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+ default:
+ ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode());
+ break;
+ }
+
+ reg_alloc.UpdateAllUses();
+ reg_alloc.AssertAllUnlocked();
+
+ if (conf.very_verbose_debugging_output) {
+ EmitVerboseDebuggingOutput(code, ctx);
+ }
+ }
+
+ fpsr_manager.Spill();
+
+ reg_alloc.AssertNoMoreUses();
+
+ EmitAddCycles(code, ctx, block.CycleCount());
+ conf.emit_terminal(code, ctx);
+ code.BRK(0);
+
+ for (const auto& deferred_emit : ctx.deferred_emits) {
+ deferred_emit();
+ }
+ code.BRK(0);
+
+ ebi.size = code.xptr<CodePtr>() - ebi.entry_point;
+ return ebi;
+}
+
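+// Relocation sites are emitted as placeholders and later patched by AddressSpace::Link /
+// LinkBlockLinks: a single NOP for prelude calls and Branch links, and BRK+NOP (two
+// instructions, enough room for the two-instruction ADRL expansion) for MoveToScratch1 links.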
+void EmitRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, LinkTarget link_target) {
+ ctx.ebi.relocations.emplace_back(Relocation{code.xptr<CodePtr>() - ctx.ebi.entry_point, link_target});
+ code.NOP();
+}
+
+void EmitBlockLinkRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, const IR::LocationDescriptor& descriptor, BlockRelocationType type) {
+ ctx.ebi.block_relocations[descriptor].emplace_back(BlockRelocation{code.xptr<CodePtr>() - ctx.ebi.entry_point, type});
+ switch (type) {
+ case BlockRelocationType::Branch:
+ code.NOP();
+ break;
+ case BlockRelocationType::MoveToScratch1:
+ code.BRK(0);
+ code.NOP();
+ break;
+ default:
+ UNREACHABLE();
+ }
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h
new file mode 100644
index 0000000000..6ac6207442
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h
@@ -0,0 +1,181 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+#include <tsl/robin_map.h>
+
+#include "dynarmic/backend/arm64/fastmem.h"
+#include "dynarmic/interface/A32/coprocessor.h"
+#include "dynarmic/interface/optimization_flags.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace oaknut {
+struct CodeGenerator;
+struct Label;
+} // namespace oaknut
+
+namespace Dynarmic::FP {
+class FPCR;
+} // namespace Dynarmic::FP
+
+namespace Dynarmic::IR {
+class Block;
+class Inst;
+enum class Cond;
+enum class Opcode;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmitContext;
+
+using CodePtr = std::byte*;
+
+enum class LinkTarget {
+ ReturnToDispatcher,
+ ReturnFromRunCode,
+ ReadMemory8,
+ ReadMemory16,
+ ReadMemory32,
+ ReadMemory64,
+ ReadMemory128,
+ WrappedReadMemory8,
+ WrappedReadMemory16,
+ WrappedReadMemory32,
+ WrappedReadMemory64,
+ WrappedReadMemory128,
+ ExclusiveReadMemory8,
+ ExclusiveReadMemory16,
+ ExclusiveReadMemory32,
+ ExclusiveReadMemory64,
+ ExclusiveReadMemory128,
+ WriteMemory8,
+ WriteMemory16,
+ WriteMemory32,
+ WriteMemory64,
+ WriteMemory128,
+ WrappedWriteMemory8,
+ WrappedWriteMemory16,
+ WrappedWriteMemory32,
+ WrappedWriteMemory64,
+ WrappedWriteMemory128,
+ ExclusiveWriteMemory8,
+ ExclusiveWriteMemory16,
+ ExclusiveWriteMemory32,
+ ExclusiveWriteMemory64,
+ ExclusiveWriteMemory128,
+ CallSVC,
+ ExceptionRaised,
+ InstructionSynchronizationBarrierRaised,
+ InstructionCacheOperationRaised,
+ DataCacheOperationRaised,
+ GetCNTPCT,
+ AddTicks,
+ GetTicksRemaining,
+};
+
+struct Relocation {
+ std::ptrdiff_t code_offset;
+ LinkTarget target;
+};
+
+enum class BlockRelocationType {
+ Branch,
+ MoveToScratch1,
+};
+
+struct BlockRelocation {
+ std::ptrdiff_t code_offset;
+ BlockRelocationType type;
+};
+
+struct EmittedBlockInfo {
+ CodePtr entry_point;
+ size_t size;
+ std::vector<Relocation> relocations;
+ tsl::robin_map<IR::LocationDescriptor, std::vector<BlockRelocation>> block_relocations;
+ tsl::robin_map<std::ptrdiff_t, FastmemPatchInfo> fastmem_patch_info;
+};
+
+struct EmitConfig {
+ OptimizationFlag optimizations;
+ bool HasOptimization(OptimizationFlag f) const { return (f & optimizations) != no_optimizations; }
+
+ bool hook_isb;
+
+ // System registers
+ u64 cntfreq_el0;
+ u32 ctr_el0;
+ u32 dczid_el0;
+ const u64* tpidrro_el0;
+ u64* tpidr_el0;
+
+ // Memory
+ bool check_halt_on_memory_access;
+
+ // Page table
+ u64 page_table_pointer;
+ size_t page_table_address_space_bits;
+ int page_table_pointer_mask_bits;
+ bool silently_mirror_page_table;
+ bool absolute_offset_page_table;
+ u8 detect_misaligned_access_via_page_table;
+ bool only_detect_misalignment_via_page_table_on_page_boundary;
+
+ // Fastmem
+ u64 fastmem_pointer;
+ bool recompile_on_fastmem_failure;
+ size_t fastmem_address_space_bits;
+ bool silently_mirror_fastmem;
+
+ // Timing
+ bool wall_clock_cntpct;
+ bool enable_cycle_counting;
+
+ // Endianness
+ bool always_little_endian;
+
+ // Frontend specific callbacks
+ FP::FPCR (*descriptor_to_fpcr)(const IR::LocationDescriptor& descriptor);
+ oaknut::Label (*emit_cond)(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond);
+ void (*emit_condition_failed_terminal)(oaknut::CodeGenerator& code, EmitContext& ctx);
+ void (*emit_terminal)(oaknut::CodeGenerator& code, EmitContext& ctx);
+ void (*emit_check_memory_abort)(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end);
+
+ // State offsets
+ size_t state_nzcv_offset;
+ size_t state_fpsr_offset;
+ size_t state_exclusive_state_offset;
+
+ // A32 specific
+ std::array<std::shared_ptr<A32::Coprocessor>, 16> coprocessors{};
+
+ // Debugging
+ bool very_verbose_debugging_output;
+};
+
+EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& emit_conf, FastmemManager& fastmem_manager);
+
+template<IR::Opcode op>
+void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+void EmitRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, LinkTarget link_target);
+void EmitBlockLinkRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, const IR::LocationDescriptor& descriptor, BlockRelocationType type);
+oaknut::Label EmitA32Cond(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond);
+oaknut::Label EmitA64Cond(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond);
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx);
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx);
+void EmitA32ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx);
+void EmitA64ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx);
+void EmitA32CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end);
+void EmitA64CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp
new file mode 100644
index 0000000000..909dde731b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp
@@ -0,0 +1,707 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/interface/halt_reason.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+oaknut::Label EmitA32Cond(oaknut::CodeGenerator& code, EmitContext&, IR::Cond cond) {
+ oaknut::Label pass;
+ // TODO: Flags in host flags
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+ code.B(static_cast<oaknut::Cond>(cond), pass);
+ return pass;
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step);
+
+void EmitA32Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret, IR::LocationDescriptor, bool) {
+ ASSERT_FALSE("Interpret should never be emitted.");
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+static void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) {
+ auto get_upper = [](const IR::LocationDescriptor& desc) -> u32 {
+ return static_cast<u32>(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32);
+ };
+
+ const u32 old_upper = get_upper(old_location);
+ const u32 new_upper = [&] {
+ const u32 mask = ~u32(ctx.conf.always_little_endian ? 0x2 : 0);
+ return get_upper(new_location) & mask;
+ }();
+
+ if (old_upper != new_upper) {
+ code.MOV(Wscratch0, new_upper);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ }
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ EmitSetUpperLocationDescriptor(code, ctx, terminal.next, initial_location);
+
+ oaknut::Label fail;
+
+ if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
+ if (ctx.conf.enable_cycle_counting) {
+ code.CMP(Xticks, 0);
+ code.B(LE, fail);
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ } else {
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, fail);
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ }
+ }
+
+ code.l(fail);
+ code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ EmitSetUpperLocationDescriptor(code, ctx, terminal.next, initial_location);
+
+ if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ }
+
+ code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC());
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
+ if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) {
+ oaknut::Label fail;
+
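+        // Pop the most recent return stack buffer entry and compare its location descriptor
+        // hash against the current one (PC in regs[15] plus upper_location_descriptor, loaded
+        // as a single 64-bit value). On a match, jump straight to the predicted code pointer;
+        // otherwise fall back to the dispatcher.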
+ code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+ code.AND(Wscratch2, Wscratch2, RSBIndexMask);
+ code.ADD(X2, SP, Xscratch2);
+ code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry));
+ code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+
+ code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb));
+
+ static_assert(offsetof(A32JitState, regs) + 16 * sizeof(u32) == offsetof(A32JitState, upper_location_descriptor));
+ code.LDUR(X0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32));
+
+ code.CMP(X0, Xscratch0);
+ code.B(NE, fail);
+ code.BR(Xscratch1);
+
+ code.l(fail);
+ }
+
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) {
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+
+ // TODO: Implement FastDispatchHint optimization
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label pass = EmitA32Cond(code, ctx, terminal.if_);
+ EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+ code.l(pass);
+ EmitA32Terminal(code, ctx, terminal.then_, initial_location, is_single_step);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label fail;
+ code.LDRB(Wscratch0, SP, offsetof(StackLayout, check_bit));
+ code.CBZ(Wscratch0, fail);
+ EmitA32Terminal(code, ctx, terminal.then_, initial_location, is_single_step);
+ code.l(fail);
+ EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label fail;
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, fail);
+ EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+ code.l(fail);
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ boost::apply_visitor([&](const auto& t) { EmitA32Terminal(code, ctx, t, initial_location, is_single_step); }, terminal);
+}
+
+void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx) {
+ const A32::LocationDescriptor location{ctx.block.Location()};
+ EmitA32Terminal(code, ctx, ctx.block.GetTerminal(), location.SetSingleStepping(false), location.SingleStepping());
+}
+
+void EmitA32ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx) {
+ const A32::LocationDescriptor location{ctx.block.Location()};
+ EmitA32Terminal(code, ctx, IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, location.SetSingleStepping(false), location.SingleStepping());
+}
+
+void EmitA32CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end) {
+ if (!ctx.conf.check_halt_on_memory_access) {
+ return;
+ }
+
+ const A32::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
+
+ code.LDAR(Xscratch0, Xhalt);
+ code.TST(Xscratch0, static_cast<u32>(HaltReason::MemoryAbort));
+ code.B(EQ, end);
+ EmitSetUpperLocationDescriptor(code, ctx, current_location, ctx.block.Location());
+ code.MOV(Wscratch0, current_location.PC());
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15);
+ EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCheckBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsImmediate()) {
+ if (args[0].GetImmediateU1()) {
+ code.MOV(Wscratch0, 1);
+ code.STRB(Wscratch0, SP, offsetof(StackLayout, check_bit));
+ } else {
+ code.STRB(WZR, SP, offsetof(StackLayout, check_bit));
+ }
+ } else {
+ auto Wbit = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wbit);
+ code.STRB(Wbit, SP, offsetof(StackLayout, check_bit));
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetRegister>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wresult);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.LDR(Wresult, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetExtendedRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsSingleExtReg(reg));
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0);
+
+ auto Sresult = ctx.reg_alloc.WriteS(inst);
+ RegAlloc::Realize(Sresult);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.LDR(Sresult, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u32) * index);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+
+ if (A32::IsDoubleExtReg(reg)) {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ RegAlloc::Realize(Dresult);
+ code.LDR(Dresult, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u64) * index);
+ } else {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Qresult);
+ code.LDR(Qresult, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u64) * index);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetExtendedRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg));
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
+
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ RegAlloc::Realize(Dresult);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.LDR(Dresult, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u32) * index);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetRegister>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wvalue = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wvalue);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.STR(Wvalue, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetExtendedRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsSingleExtReg(reg));
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Svalue = ctx.reg_alloc.ReadS(args[1]);
+ RegAlloc::Realize(Svalue);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.STR(Svalue, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u32) * index);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetExtendedRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg));
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Dvalue = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Dvalue);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.STR(Dvalue, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u32) * index);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (A32::IsDoubleExtReg(reg)) {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
+ auto Dvalue = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Dvalue);
+ code.STR(Dvalue, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u64) * index);
+ } else {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0);
+ auto Qvalue = ctx.reg_alloc.ReadQ(args[1]);
+ RegAlloc::Realize(Qvalue);
+ code.STR(Qvalue, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u64) * index);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetCpsr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wcpsr = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wcpsr);
+
+ static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q));
+
+ code.LDP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.LDR(Wcpsr, Xstate, offsetof(A32JitState, cpsr_jaifm));
+ code.ORR(Wcpsr, Wcpsr, Wscratch0);
+ code.ORR(Wcpsr, Wcpsr, Wscratch1);
+
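+    // GE flags are stored as four 0x00/0xFF byte masks in cpsr_ge. Keep the top bit of each
+    // byte, multiply by 0x00204081 to gather those four bits into bits 31:28, then shift them
+    // down into CPSR.GE (bits 19:16).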
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_ge));
+ code.AND(Wscratch0, Wscratch0, 0x80808080);
+ code.MOV(Wscratch1, 0x00204081);
+ code.MUL(Wscratch0, Wscratch0, Wscratch1);
+ code.AND(Wscratch0, Wscratch0, 0xf0000000);
+ code.ORR(Wcpsr, Wcpsr, Wscratch0, LSR, 12);
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.AND(Wscratch0, Wscratch0, 0b11);
+    // Bits 0 (T) and 1 (E) of the upper location descriptor become CPSR bits 5 (T) and 9 (E).
+ code.ORR(Wscratch0, Wscratch0, Wscratch0, LSL, 3);
+ code.AND(Wscratch0, Wscratch0, 0x11111111);
+ code.ORR(Wcpsr, Wcpsr, Wscratch0, LSL, 5);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wcpsr = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wcpsr);
+
+ // NZCV, Q flags
+ code.AND(Wscratch0, Wcpsr, 0xF0000000);
+ code.AND(Wscratch1, Wcpsr, 1 << 27);
+
+ static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q));
+ code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv));
+
+ // GE flags
+ // this does the following:
+ // cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0;
+ // cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0;
+ // cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0;
+ // cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 0x000000FF : 0;
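+    // UBFX extracts CPSR.GE (bits 19:16); multiplying by 0x00204081 replicates each of the
+    // four bits into the low bit of a separate byte, and (x << 8) - x then turns every 0x01
+    // byte into 0xFF.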
+ code.UBFX(Wscratch0, Wcpsr, 16, 4);
+ code.MOV(Wscratch1, 0x00204081);
+ code.MUL(Wscratch0, Wscratch0, Wscratch1);
+ code.AND(Wscratch0, Wscratch0, 0x01010101);
+ code.LSL(Wscratch1, Wscratch0, 8);
+ code.SUB(Wscratch0, Wscratch1, Wscratch0);
+
+ // Other flags
+ code.MOV(Wscratch1, 0x010001DF);
+ code.AND(Wscratch1, Wcpsr, Wscratch1);
+
+ static_assert(offsetof(A32JitState, cpsr_jaifm) + sizeof(u32) == offsetof(A32JitState, cpsr_ge));
+ code.STP(Wscratch1, Wscratch0, Xstate, offsetof(A32JitState, cpsr_jaifm));
+
+ // IT state
+ code.AND(Wscratch0, Wcpsr, 0xFC00);
+ code.LSR(Wscratch1, Wcpsr, 17);
+ code.AND(Wscratch1, Wscratch1, 0x300);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+
+ // E flag, T flag
+ code.LSR(Wscratch1, Wcpsr, 8);
+ code.AND(Wscratch1, Wscratch1, 0x2);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+ code.LDR(Wscratch1, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.BFXIL(Wscratch0, Wcpsr, 5, 1);
+ code.AND(Wscratch1, Wscratch1, 0xFFFF0000);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ code.STR(Wnzcv, Xstate, offsetof(A32JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsrNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ code.STR(Wnzcv, Xstate, offsetof(A32JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsrNZCVQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q));
+
+ code.AND(Wscratch0, Wnzcv, 0xf000'0000);
+ code.AND(Wscratch1, Wnzcv, 0x0800'0000);
+ code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsrNZ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wnz = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnz);
+
+ // TODO: Track latent value
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wscratch0, Wscratch0, 0x30000000);
+ code.ORR(Wscratch0, Wscratch0, Wnz);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetCpsrNZC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ // TODO: Track latent value
+
+ if (args[0].IsImmediate()) {
+ if (args[1].IsImmediate()) {
+ const u32 carry = args[1].GetImmediateU1() ? 0x2000'0000 : 0;
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wscratch0, Wscratch0, 0x10000000);
+ if (carry) {
+ code.ORR(Wscratch0, Wscratch0, carry);
+ }
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ } else {
+ auto Wc = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wc);
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wscratch0, Wscratch0, 0x10000000);
+ code.ORR(Wscratch0, Wscratch0, Wc);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ }
+ } else {
+ if (args[1].IsImmediate()) {
+ const u32 carry = args[1].GetImmediateU1() ? 0x2000'0000 : 0;
+ auto Wnz = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnz);
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wscratch0, Wscratch0, 0x10000000);
+ code.ORR(Wscratch0, Wscratch0, Wnz);
+ if (carry) {
+ code.ORR(Wscratch0, Wscratch0, carry);
+ }
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ } else {
+ auto Wnz = ctx.reg_alloc.ReadW(args[0]);
+ auto Wc = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wnz, Wc);
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wscratch0, Wscratch0, 0x10000000);
+ code.ORR(Wscratch0, Wscratch0, Wnz);
+ code.ORR(Wscratch0, Wscratch0, Wc);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetCFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wflag = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wflag);
+
+ code.LDR(Wflag, Xstate, offsetof(A32JitState, cpsr_nzcv));
+ code.AND(Wflag, Wflag, 1 << 29);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32OrQFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wflag = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wflag);
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_q));
+ code.ORR(Wscratch0, Wscratch0, Wflag, LSL, 27);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_q));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetGEFlags>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Snzcv = ctx.reg_alloc.WriteS(inst);
+ RegAlloc::Realize(Snzcv);
+
+ code.LDR(Snzcv, Xstate, offsetof(A32JitState, cpsr_ge));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetGEFlags>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Snzcv = ctx.reg_alloc.ReadS(args[0]);
+ RegAlloc::Realize(Snzcv);
+
+ code.STR(Snzcv, Xstate, offsetof(A32JitState, cpsr_ge));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetGEFlagsCompressed>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wge = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wge);
+
+ code.LSR(Wscratch0, Wge, 16);
+ code.MOV(Wscratch1, 0x00204081);
+ code.MUL(Wscratch0, Wscratch0, Wscratch1);
+ code.AND(Wscratch0, Wscratch0, 0x01010101);
+ code.LSL(Wscratch1, Wscratch0, 8);
+ code.SUB(Wscratch0, Wscratch1, Wscratch0);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_ge));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32BXWritePC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const u32 upper_without_t = (A32::LocationDescriptor{ctx.block.EndLocation()}.SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE;
+
+ static_assert(offsetof(A32JitState, regs) + 16 * sizeof(u32) == offsetof(A32JitState, upper_location_descriptor));
+
+ if (args[0].IsImmediate()) {
+ const u32 new_pc = args[0].GetImmediateU32();
+ const u32 mask = mcl::bit::get_bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
+ const u32 new_upper = upper_without_t | (mcl::bit::get_bit<0>(new_pc) ? 1 : 0);
+
+ code.MOV(Xscratch0, (u64{new_upper} << 32) | (new_pc & mask));
+ code.STUR(Xscratch0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32));
+ } else {
+ auto Wpc = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wpc);
+ ctx.reg_alloc.SpillFlags();
+
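+        // Bit 0 of the target selects the instruction set: if set, switch to Thumb (clear only
+        // bit 0 of the PC and set the T bit in the upper location descriptor via CINC);
+        // otherwise clear bits 1:0 and leave T clear. PC and the upper descriptor are stored
+        // as an adjacent pair.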
+ code.ANDS(Wscratch0, Wpc, 1);
+ code.MOV(Wscratch1, 3);
+ code.CSEL(Wscratch1, Wscratch0, Wscratch1, NE);
+ code.BIC(Wscratch1, Wpc, Wscratch1);
+ code.MOV(Wscratch0, upper_without_t);
+ code.CINC(Wscratch0, Wscratch0, NE);
+ code.STP(Wscratch1, Wscratch0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32));
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32UpdateUpperLocationDescriptor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) {
+ for (auto& inst : ctx.block) {
+ if (inst.GetOpcode() == IR::Opcode::A32BXWritePC) {
+ return;
+ }
+ }
+ EmitSetUpperLocationDescriptor(code, ctx, ctx.block.EndLocation(), ctx.block.Location());
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall();
+
+ if (ctx.conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ EmitRelocation(code, ctx, LinkTarget::AddTicks);
+ }
+
+ code.MOV(W1, args[0].GetImmediateU32());
+ EmitRelocation(code, ctx, LinkTarget::CallSVC);
+
+ if (ctx.conf.enable_cycle_counting) {
+ EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
+ code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
+ code.MOV(Xticks, X0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall();
+
+ if (ctx.conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ EmitRelocation(code, ctx, LinkTarget::AddTicks);
+ }
+
+ code.MOV(W1, args[0].GetImmediateU32());
+ code.MOV(W2, args[1].GetImmediateU32());
+ EmitRelocation(code, ctx, LinkTarget::ExceptionRaised);
+
+ if (ctx.conf.enable_cycle_counting) {
+ EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
+ code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
+ code.MOV(Xticks, X0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32DataSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.DSB(oaknut::BarrierOp::SY);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32DataMemoryBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.DMB(oaknut::BarrierOp::SY);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32InstructionSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) {
+ if (!ctx.conf.hook_isb) {
+ return;
+ }
+
+ ctx.reg_alloc.PrepareForCall();
+ EmitRelocation(code, ctx, LinkTarget::InstructionSynchronizationBarrierRaised);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetFpscr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wfpscr = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wfpscr);
+ ctx.fpsr.Spill();
+
+ static_assert(offsetof(A32JitState, fpsr) + sizeof(u32) == offsetof(A32JitState, fpsr_nzcv));
+
+ code.LDR(Wfpscr, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.LDP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, fpsr));
+ code.AND(Wfpscr, Wfpscr, 0xffff'0000);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+ code.ORR(Wfpscr, Wfpscr, Wscratch0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetFpscr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wfpscr = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wfpscr);
+ ctx.fpsr.Overwrite();
+
+ static_assert(offsetof(A32JitState, fpsr) + sizeof(u32) == offsetof(A32JitState, fpsr_nzcv));
+
+ code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+ code.MOV(Wscratch1, 0x07f7'0000);
+ code.AND(Wscratch1, Wfpscr, Wscratch1);
+ code.AND(Wscratch0, Wscratch0, 0x0000'ffff);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+ code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor));
+
+ code.MOV(Wscratch0, 0x0800'009f);
+ code.AND(Wscratch0, Wfpscr, Wscratch0);
+ code.AND(Wscratch1, Wfpscr, 0xf000'0000);
+ code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, fpsr));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32GetFpscrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wnzcv = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wnzcv);
+
+ code.LDR(Wnzcv, Xstate, offsetof(A32JitState, fpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32SetFpscrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ code.STR(Wnzcv, Xstate, offsetof(A32JitState, fpsr_nzcv));
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp
new file mode 100644
index 0000000000..5115fbbbdb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp
@@ -0,0 +1,299 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/interface/A32/coprocessor.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+static void EmitCoprocessorException() {
+ ASSERT_FALSE("Should raise coproc exception here");
+}
+
+static void CallCoprocCallback(oaknut::CodeGenerator& code, EmitContext& ctx, A32::Coprocessor::Callback callback, IR::Inst* inst = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}) {
+ ctx.reg_alloc.PrepareForCall({}, arg0, arg1);
+
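+    // X0 is deliberately left free by PrepareForCall so the optional user argument can be
+    // placed there; arg0/arg1 are marshalled into the following argument registers. If an
+    // inst is supplied, the callback's result is taken from X0 after the call.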
+ if (callback.user_arg) {
+ code.MOV(X0, reinterpret_cast<u64>(*callback.user_arg));
+ }
+
+ code.MOV(Xscratch0, reinterpret_cast<u64>(callback.function));
+ code.BLR(Xscratch0);
+
+ if (inst) {
+ ctx.reg_alloc.DefineAsRegister(inst, X0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocInternalOperation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[5]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[6]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx, *action);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocSendOneWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[5]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx, *cb, nullptr, args[1]);
+ return;
+ }
+
+ if (const auto destination_ptr = std::get_if<u32*>(&action)) {
+ auto Wvalue = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wvalue);
+
+ code.MOV(Xscratch0, reinterpret_cast<u64>(*destination_ptr));
+ code.STR(Wvalue, Xscratch0);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocSendTwoWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc = static_cast<unsigned>(coproc_info[2]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileSendTwoWords(two, opc, CRm);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx, *cb, nullptr, args[1], args[2]);
+ return;
+ }
+
+ if (const auto destination_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
+ auto Wvalue1 = ctx.reg_alloc.ReadW(args[1]);
+ auto Wvalue2 = ctx.reg_alloc.ReadW(args[2]);
+ RegAlloc::Realize(Wvalue1, Wvalue2);
+
+ code.MOV(Xscratch0, reinterpret_cast<u64>((*destination_ptrs)[0]));
+ code.MOV(Xscratch1, reinterpret_cast<u64>((*destination_ptrs)[1]));
+ code.STR(Wvalue1, Xscratch0);
+ code.STR(Wvalue2, Xscratch1);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocGetOneWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[5]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx, *cb, inst);
+ return;
+ }
+
+ if (const auto source_ptr = std::get_if<u32*>(&action)) {
+ auto Wvalue = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wvalue);
+
+ code.MOV(Xscratch0, reinterpret_cast<u64>(*source_ptr));
+ code.LDR(Wvalue, Xscratch0);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocGetTwoWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const unsigned opc = coproc_info[2];
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ auto action = coproc->CompileGetTwoWords(two, opc, CRm);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx, *cb, inst);
+ return;
+ }
+
+ if (const auto source_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
+ auto Xvalue = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xvalue);
+
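+        // Pack the two 32-bit words into a single 64-bit result: low half from the
+        // first pointer, high half from the second (inserted with BFI).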
+ code.MOV(Xscratch0, reinterpret_cast<u64>((*source_ptrs)[0]));
+ code.MOV(Xscratch1, reinterpret_cast<u64>((*source_ptrs)[1]));
+ code.LDR(Xvalue, Xscratch0);
+ code.LDR(Wscratch1, Xscratch1);
+ code.BFI(Xvalue, Xscratch1, 32, 32);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocLoadWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const bool long_transfer = coproc_info[2] != 0;
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const bool has_option = coproc_info[4] != 0;
+
+ std::optional<u8> option = std::nullopt;
+ if (has_option) {
+ option = coproc_info[5];
+ }
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx, *action, nullptr, args[1]);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32CoprocStoreWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const bool long_transfer = coproc_info[2] != 0;
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const bool has_option = coproc_info[4] != 0;
+
+ std::optional<u8> option = std::nullopt;
+ if (has_option) {
+ option = coproc_info[5];
+ }
+
+ std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx, *action, nullptr, args[1]);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp
new file mode 100644
index 0000000000..9e8426e230
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp
@@ -0,0 +1,107 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_arm64_memory.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<>
+void EmitIR<IR::Opcode::A32ClearExclusive>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.STR(WZR, Xstate, offsetof(A32JitState, exclusive_state));
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32WriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32WriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32WriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32WriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveWriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveWriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveWriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A32ExclusiveWriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<64>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp
new file mode 100644
index 0000000000..67fd91f26a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp
@@ -0,0 +1,518 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit_cast.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/interface/halt_reason.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+oaknut::Label EmitA64Cond(oaknut::CodeGenerator& code, EmitContext&, IR::Cond cond) {
+ oaknut::Label pass;
+ // TODO: Flags in host flags
+ code.LDR(Wscratch0, Xstate, offsetof(A64JitState, cpsr_nzcv));
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+ code.B(static_cast<oaknut::Cond>(cond), pass);
+ return pass;
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step);
+
+void EmitA64Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret, IR::LocationDescriptor, bool) {
+ ASSERT_FALSE("Interpret should never be emitted.");
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) {
+ oaknut::Label fail;
+
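+    // With block linking enabled (and not single-stepping), emit a later-patched branch to
+    // the next block, guarded by the remaining tick count or the halt flag. The fallback
+    // path stores the target PC and returns to the dispatcher.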
+ if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
+ if (ctx.conf.enable_cycle_counting) {
+ code.CMP(Xticks, 0);
+ code.B(LE, fail);
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ } else {
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, fail);
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ }
+ }
+
+ code.l(fail);
+ code.MOV(Xscratch0, A64::LocationDescriptor{terminal.next}.PC());
+ code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc));
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) {
+ if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) {
+ EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch);
+ }
+
+ code.MOV(Xscratch0, A64::LocationDescriptor{terminal.next}.PC());
+ code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc));
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
+ if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) {
+ oaknut::Label fail;
+
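+        // Rebuild the current location descriptor (FPCR bits | PC) in X0, pop the most
+        // recent return stack buffer entry from the stack frame, and branch directly to
+        // the cached host code if the descriptors match; otherwise fall through to the
+        // dispatcher.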
+ code.MOV(Wscratch0, A64::LocationDescriptor::fpcr_mask);
+ code.LDR(W0, Xstate, offsetof(A64JitState, fpcr));
+ code.LDR(X1, Xstate, offsetof(A64JitState, pc));
+ code.AND(W0, W0, Wscratch0);
+ code.AND(X1, X1, A64::LocationDescriptor::pc_mask);
+ code.LSL(X0, X0, A64::LocationDescriptor::fpcr_shift);
+ code.ORR(X0, X0, X1);
+
+ code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+ code.AND(Wscratch2, Wscratch2, RSBIndexMask);
+ code.ADD(X2, SP, Xscratch2);
+ code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry));
+ code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr));
+
+ code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb));
+
+ code.CMP(X0, Xscratch0);
+ code.B(NE, fail);
+ code.BR(Xscratch1);
+
+ code.l(fail);
+ }
+
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) {
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+
+ // TODO: Implement FastDispatchHint optimization
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label pass = EmitA64Cond(code, ctx, terminal.if_);
+ EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+ code.l(pass);
+ EmitA64Terminal(code, ctx, terminal.then_, initial_location, is_single_step);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label fail;
+ code.LDRB(Wscratch0, SP, offsetof(StackLayout, check_bit));
+ code.CBZ(Wscratch0, fail);
+ EmitA64Terminal(code, ctx, terminal.then_, initial_location, is_single_step);
+ code.l(fail);
+ EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ oaknut::Label fail;
+ code.LDAR(Wscratch0, Xhalt);
+ code.CBNZ(Wscratch0, fail);
+ EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step);
+ code.l(fail);
+ EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ boost::apply_visitor([&](const auto& t) { EmitA64Terminal(code, ctx, t, initial_location, is_single_step); }, terminal);
+}
+
+void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx) {
+ const A64::LocationDescriptor location{ctx.block.Location()};
+ EmitA64Terminal(code, ctx, ctx.block.GetTerminal(), location.SetSingleStepping(false), location.SingleStepping());
+}
+
+void EmitA64ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx) {
+ const A64::LocationDescriptor location{ctx.block.Location()};
+ EmitA64Terminal(code, ctx, IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, location.SetSingleStepping(false), location.SingleStepping());
+}
+
+void EmitA64CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end) {
+ if (!ctx.conf.check_halt_on_memory_access) {
+ return;
+ }
+
+ const A64::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
+
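+    // If a MemoryAbort halt has been requested, write back the PC of the aborting
+    // instruction and leave the run loop; otherwise skip ahead to the end label.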
+ code.LDAR(Xscratch0, Xhalt);
+ code.TST(Xscratch0, static_cast<u32>(HaltReason::MemoryAbort));
+ code.B(EQ, end);
+ code.MOV(Xscratch0, current_location.PC());
+ code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc));
+ EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetCheckBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsImmediate()) {
+ if (args[0].GetImmediateU1()) {
+ code.MOV(Wscratch0, 1);
+ code.STRB(Wscratch0, SP, offsetof(StackLayout, check_bit));
+ } else {
+ code.STRB(WZR, SP, offsetof(StackLayout, check_bit));
+ }
+ } else {
+ auto Wbit = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wbit);
+ code.STRB(Wbit, SP, offsetof(StackLayout, check_bit));
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetCFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wflag = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wflag);
+ code.LDR(Wflag, Xstate, offsetof(A64JitState, cpsr_nzcv));
+ code.AND(Wflag, Wflag, 1 << 29);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wnzcv = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wnzcv);
+
+ code.LDR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ code.STR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wnzcv = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wnzcv);
+
+ code.STR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetW>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wresult);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.LDR(Wresult, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetX>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xresult);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.LDR(Xresult, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetS>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Sresult = ctx.reg_alloc.WriteS(inst);
+ RegAlloc::Realize(Sresult);
+ code.LDR(Sresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetD>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ RegAlloc::Realize(Dresult);
+ code.LDR(Dresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Qresult);
+ code.LDR(Qresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetSP>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xresult);
+
+ code.LDR(Xresult, Xstate, offsetof(A64JitState, sp));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetFPCR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wresult);
+
+ code.LDR(Wresult, Xstate, offsetof(A64JitState, fpcr));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetFPSR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wresult);
+
+ code.LDR(Wresult, Xstate, offsetof(A64JitState, fpsr));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetW>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wvalue = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wvalue);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
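+    // A 32-bit MOV zero-extends into the full X register, so the 64-bit store below also
+    // clears the upper half of the guest register.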
+ code.MOV(*Wvalue, Wvalue);
+ code.STR(Wvalue->toX(), Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetX>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Xvalue = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xvalue);
+
+ // TODO: Detect if Gpr vs Fpr is more appropriate
+
+ code.STR(Xvalue, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetS>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Svalue = ctx.reg_alloc.ReadS(args[1]);
+ RegAlloc::Realize(Svalue);
+
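+    // FMOV Sd, Sn zeroes the remaining lanes, so storing the whole Q register writes the
+    // upper bits of the guest vector as zero.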
+ code.FMOV(Svalue, Svalue);
+ code.STR(Svalue->toQ(), Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetD>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Dvalue = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Dvalue);
+
+ code.FMOV(Dvalue, Dvalue);
+ code.STR(Dvalue->toQ(), Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ auto Qvalue = ctx.reg_alloc.ReadQ(args[1]);
+ RegAlloc::Realize(Qvalue);
+ code.STR(Qvalue, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetSP>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xvalue);
+ code.STR(Xvalue, Xstate, offsetof(A64JitState, sp));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetFPCR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wvalue = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wvalue);
+ code.STR(Wvalue, Xstate, offsetof(A64JitState, fpcr));
+ code.MSR(oaknut::SystemReg::FPCR, Wvalue->toX());
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetFPSR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wvalue = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wvalue);
+ code.STR(Wvalue, Xstate, offsetof(A64JitState, fpsr));
+ code.MSR(oaknut::SystemReg::FPSR, Wvalue->toX());
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetPC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xvalue);
+ code.STR(Xvalue, Xstate, offsetof(A64JitState, pc));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64CallSupervisor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall();
+
+ if (ctx.conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ EmitRelocation(code, ctx, LinkTarget::AddTicks);
+ }
+
+ code.MOV(W1, args[0].GetImmediateU32());
+ EmitRelocation(code, ctx, LinkTarget::CallSVC);
+
+ if (ctx.conf.enable_cycle_counting) {
+ EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
+ code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
+ code.MOV(Xticks, X0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExceptionRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall();
+
+ if (ctx.conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ EmitRelocation(code, ctx, LinkTarget::AddTicks);
+ }
+
+ code.MOV(X1, args[0].GetImmediateU64());
+ code.MOV(X2, args[1].GetImmediateU64());
+ EmitRelocation(code, ctx, LinkTarget::ExceptionRaised);
+
+ if (ctx.conf.enable_cycle_counting) {
+ EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
+ code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
+ code.MOV(Xticks, X0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::A64DataCacheOperationRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[1], args[2]);
+ EmitRelocation(code, ctx, LinkTarget::DataCacheOperationRaised);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64InstructionCacheOperationRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[0], args[1]);
+ EmitRelocation(code, ctx, LinkTarget::InstructionCacheOperationRaised);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64DataSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.DSB(oaknut::BarrierOp::SY);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64DataMemoryBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.DMB(oaknut::BarrierOp::SY);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64InstructionSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) {
+ if (!ctx.conf.hook_isb) {
+ return;
+ }
+
+ ctx.reg_alloc.PrepareForCall();
+ EmitRelocation(code, ctx, LinkTarget::InstructionSynchronizationBarrierRaised);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetCNTFRQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Xvalue = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xvalue);
+ code.MOV(Xvalue, ctx.conf.cntfreq_el0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetCNTPCT>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.PrepareForCall();
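+    // When CNTPCT is derived from the cycle counter rather than the wall clock, commit the
+    // ticks consumed so far and refresh the budget, presumably so the callback observes an
+    // up-to-date count.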
+ if (!ctx.conf.wall_clock_cntpct && ctx.conf.enable_cycle_counting) {
+ code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run));
+ code.SUB(X1, X1, Xticks);
+ EmitRelocation(code, ctx, LinkTarget::AddTicks);
+ EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining);
+ code.STR(X0, SP, offsetof(StackLayout, cycles_to_run));
+ code.MOV(Xticks, X0);
+ }
+ EmitRelocation(code, ctx, LinkTarget::GetCNTPCT);
+ ctx.reg_alloc.DefineAsRegister(inst, X0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetCTR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wvalue = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wvalue);
+ code.MOV(Wvalue, ctx.conf.ctr_el0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetDCZID>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Wvalue = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Wvalue);
+ code.MOV(Wvalue, ctx.conf.dczid_el0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetTPIDR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Xvalue = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xvalue);
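+    // ctx.conf.tpidr_el0 is a host pointer to the guest TPIDR_EL0 value; read through it.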
+ code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidr_el0));
+ code.LDR(Xvalue, Xscratch0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64GetTPIDRRO>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Xvalue = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Xvalue);
+ code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidrro_el0));
+ code.LDR(Xvalue, Xscratch0);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64SetTPIDR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xvalue);
+ code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidr_el0));
+ code.STR(Xvalue, Xscratch0);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp
new file mode 100644
index 0000000000..099c81a4c3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp
@@ -0,0 +1,128 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_arm64_memory.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/acc_type.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<>
+void EmitIR<IR::Opcode::A64ClearExclusive>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) {
+ code.STR(WZR, Xstate, offsetof(A64JitState, exclusive_state));
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ReadMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReadMemory<128>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveReadMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveReadMemory<128>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64WriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64WriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64WriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64WriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64WriteMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitWriteMemory<128>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveWriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<8>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveWriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<16>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveWriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveWriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::A64ExclusiveWriteMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitExclusiveWriteMemory<128>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp
new file mode 100644
index 0000000000..b0a3cb4817
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp
@@ -0,0 +1,166 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
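+// Shared CRC helper: bitsize selects whether the data operand is read as a W or an X
+// register, matching the byte/half/word/doubleword CRC32 instruction forms used below.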
+template<size_t bitsize, typename EmitFn>
+static void EmitCRC(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit_fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Woutput = ctx.reg_alloc.WriteW(inst);
+ auto Winput = ctx.reg_alloc.ReadW(args[0]);
+ auto Rdata = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Woutput, Winput, Rdata);
+
+ emit_fn(Woutput, Winput, Rdata);
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32Castagnoli8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CB(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32Castagnoli16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CH(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32Castagnoli32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CW(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32Castagnoli64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<64>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Xdata) { code.CRC32CX(Woutput, Winput, Xdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32ISO8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32B(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32ISO16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32H(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32ISO32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32W(Woutput, Winput, Wdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CRC32ISO64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC<64>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Xdata) { code.CRC32X(Woutput, Winput, Xdata); });
+}
+
+template<>
+void EmitIR<IR::Opcode::AESDecryptSingleRound>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qoutput = ctx.reg_alloc.WriteQ(inst);
+ auto Qinput = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qoutput, Qinput);
+
+ code.MOVI(Qoutput->toD(), oaknut::RepImm{0});
+ code.AESD(Qoutput->B16(), Qinput->B16());
+}
+
+template<>
+void EmitIR<IR::Opcode::AESEncryptSingleRound>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qoutput = ctx.reg_alloc.WriteQ(inst);
+ auto Qinput = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qoutput, Qinput);
+
+ code.MOVI(Qoutput->toD(), oaknut::RepImm{0});
+ code.AESE(Qoutput->B16(), Qinput->B16());
+}
+
+template<>
+void EmitIR<IR::Opcode::AESInverseMixColumns>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qoutput = ctx.reg_alloc.WriteQ(inst);
+ auto Qinput = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qoutput, Qinput);
+
+ code.AESIMC(Qoutput->B16(), Qinput->B16());
+}
+
+template<>
+void EmitIR<IR::Opcode::AESMixColumns>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qoutput = ctx.reg_alloc.WriteQ(inst);
+ auto Qinput = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qoutput, Qinput);
+
+ code.AESMC(Qoutput->B16(), Qinput->B16());
+}
+
+template<>
+void EmitIR<IR::Opcode::SM4AccessSubstitutionBox>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SHA256Hash>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool part1 = args[3].GetImmediateU1();
+
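+    // part1 selects SHA256H (accumulating into x) versus SHA256H2 (accumulating into y),
+    // hence the differing read/write register assignments below.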
+ if (part1) {
+ auto Qx = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Qy = ctx.reg_alloc.ReadQ(args[1]);
+ auto Qz = ctx.reg_alloc.ReadQ(args[2]);
+ RegAlloc::Realize(Qx, Qy, Qz);
+
+ code.SHA256H(Qx, Qy, Qz->S4());
+ } else {
+ auto Qx = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qy = ctx.reg_alloc.ReadWriteQ(args[1], inst);
+ auto Qz = ctx.reg_alloc.ReadQ(args[2]);
+ RegAlloc::Realize(Qx, Qy, Qz);
+
+        code.SHA256H2(Qy, Qx, Qz->S4());  // Note: Qy and Qx are deliberately swapped for SHA256H2
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::SHA256MessageSchedule0>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qa = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ RegAlloc::Realize(Qa, Qb);
+
+ code.SHA256SU0(Qa->S4(), Qb->S4());
+}
+
+template<>
+void EmitIR<IR::Opcode::SHA256MessageSchedule1>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qa = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ auto Qc = ctx.reg_alloc.ReadQ(args[2]);
+ RegAlloc::Realize(Qa, Qb, Qc);
+
+ code.SHA256SU1(Qa->S4(), Qb->S4(), Qc->S4());
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp
new file mode 100644
index 0000000000..04395148c8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp
@@ -0,0 +1,1523 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstddef>
+
+#include <fmt/ostream.h>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<size_t bitsize, typename EmitFn>
+static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst);
+ auto Roperand = ctx.reg_alloc.ReadReg<bitsize>(args[0]);
+ RegAlloc::Realize(Rresult, Roperand);
+
+ emit(Rresult, Roperand);
+}
+
+template<size_t bitsize, typename EmitFn>
+static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst);
+ auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]);
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Rresult, Ra, Rb);
+
+ emit(Rresult, Ra, Rb);
+}
+
+template<>
+void EmitIR<IR::Opcode::Pack2x32To1x64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wlo = ctx.reg_alloc.ReadW(args[0]);
+ auto Whi = ctx.reg_alloc.ReadW(args[1]);
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ RegAlloc::Realize(Wlo, Whi, Xresult);
+
+    code.MOV(Xresult->toW(), Wlo);  // TODO: Move elimination
+ code.BFI(Xresult, Whi->toX(), 32, 32);
+}
+
+template<>
+void EmitIR<IR::Opcode::Pack2x64To1x128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
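+    // Four cases depending on whether each 64-bit half currently lives in a GPR or an FPR:
+    // the low half becomes D[0] of the result and the high half is inserted into D[1].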
+ if (args[0].IsInGpr() && args[1].IsInGpr()) {
+ auto Xlo = ctx.reg_alloc.ReadX(args[0]);
+ auto Xhi = ctx.reg_alloc.ReadX(args[1]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Xlo, Xhi, Qresult);
+
+ code.FMOV(Qresult->toD(), Xlo);
+ code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], Xhi);
+ } else if (args[0].IsInGpr()) {
+ auto Xlo = ctx.reg_alloc.ReadX(args[0]);
+ auto Dhi = ctx.reg_alloc.ReadD(args[1]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Xlo, Dhi, Qresult);
+
+ code.FMOV(Qresult->toD(), Xlo);
+ code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], oaknut::VRegSelector{Dhi->index()}.D()[0]);
+ } else if (args[1].IsInGpr()) {
+ auto Dlo = ctx.reg_alloc.ReadD(args[0]);
+ auto Xhi = ctx.reg_alloc.ReadX(args[1]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Dlo, Xhi, Qresult);
+
+        code.FMOV(Qresult->toD(), Dlo);  // TODO: Move elimination
+ code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], Xhi);
+ } else {
+ auto Dlo = ctx.reg_alloc.ReadD(args[0]);
+ auto Dhi = ctx.reg_alloc.ReadD(args[1]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Dlo, Dhi, Qresult);
+
+        code.FMOV(Qresult->toD(), Dlo);  // TODO: Move elimination
+ code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], oaknut::VRegSelector{Dhi->index()}.D()[0]);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::LeastSignificantWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Wresult, Xoperand);
+
+ code.MOV(Wresult, Xoperand->toW()); // TODO: Zext elimination
+}
+
+template<>
+void EmitIR<IR::Opcode::LeastSignificantHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.UXTH(Wresult, Woperand); // TODO: Zext elimination
+}
+
+template<>
+void EmitIR<IR::Opcode::LeastSignificantByte>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.UXTB(Wresult, Woperand); // TODO: Zext elimination
+}
+
+template<>
+void EmitIR<IR::Opcode::MostSignificantWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Wresult, Xoperand);
+
+ code.LSR(Wresult->toX(), Xoperand, 32);
+
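+    // The associated carry pseudo-op receives bit 31 of the low word, repositioned into
+    // the NZCV carry bit (bit 29).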
+ if (carry_inst) {
+ auto Wcarry = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wcarry);
+
+ code.LSR(Wcarry, Xoperand->toW(), 31 - 29);
+ code.AND(Wcarry, Wcarry, 1 << 29);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::MostSignificantBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.LSR(Wresult, Woperand, 31);
+}
+
+template<>
+void EmitIR<IR::Opcode::IsZero32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wresult, Woperand);
+ ctx.reg_alloc.SpillFlags();
+
+ code.CMP(Woperand, 0);
+ code.CSET(Wresult, EQ);
+}
+
+template<>
+void EmitIR<IR::Opcode::IsZero64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Wresult, Xoperand);
+ ctx.reg_alloc.SpillFlags();
+
+ code.CMP(Xoperand, 0);
+ code.CSET(Wresult, EQ);
+}
+
+template<>
+void EmitIR<IR::Opcode::TestBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xresult, Xoperand);
+ ASSERT(args[1].IsImmediate());
+ ASSERT(args[1].GetImmediateU8() < 64);
+
+ code.UBFX(Xresult, Xoperand, args[1].GetImmediateU8(), 1);
+}
+
+template<>
+void EmitIR<IR::Opcode::ConditionalSelect32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const IR::Cond cond = args[0].GetImmediateCond();
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wthen = ctx.reg_alloc.ReadW(args[1]);
+ auto Welse = ctx.reg_alloc.ReadW(args[2]);
+ RegAlloc::Realize(Wresult, Wthen, Welse);
+ ctx.reg_alloc.SpillFlags();
+
+ // TODO: FSEL for fprs
+
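+    // Move the guest NZCV into the host flags so CSEL can evaluate the IR condition
+    // directly.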
+ code.LDR(Wscratch0, Xstate, ctx.conf.state_nzcv_offset);
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+ code.CSEL(Wresult, Wthen, Welse, static_cast<oaknut::Cond>(cond));
+}
+
+template<>
+void EmitIR<IR::Opcode::ConditionalSelect64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const IR::Cond cond = args[0].GetImmediateCond();
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xthen = ctx.reg_alloc.ReadX(args[1]);
+ auto Xelse = ctx.reg_alloc.ReadX(args[2]);
+ RegAlloc::Realize(Xresult, Xthen, Xelse);
+ ctx.reg_alloc.SpillFlags();
+
+ // TODO: FSEL for fprs
+
+ code.LDR(Wscratch0, Xstate, ctx.conf.state_nzcv_offset);
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+ code.CSEL(Xresult, Xthen, Xelse, static_cast<oaknut::Cond>(cond));
+}
+
+template<>
+void EmitIR<IR::Opcode::ConditionalSelectNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitIR<IR::Opcode::ConditionalSelect32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ if (shift <= 31) {
+ code.LSL(Wresult, Woperand, shift);
+ } else {
+ code.MOV(Wresult, WZR);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ RegAlloc::Realize(Wresult, Woperand, Wshift);
+ ctx.reg_alloc.SpillFlags();
+
+ code.AND(Wscratch0, Wshift, 0xff);
+ code.LSL(Wresult, Woperand, Wscratch0);
+ code.CMP(Wscratch0, 32);
+ code.CSEL(Wresult, Wresult, WZR, LT);
+ }
+ } else {
+ if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) {
+ ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg);
+ ctx.reg_alloc.DefineAsExisting(inst, operand_arg);
+ } else if (shift_arg.IsImmediate()) {
+ // TODO: Use RMIF
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift < 32) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
+ code.UBFX(Wcarry_out, Woperand, 32 - shift, 1);
+ code.LSL(Wcarry_out, Wcarry_out, 29);
+ code.LSL(Wresult, Woperand, shift);
+ } else if (shift > 32) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wresult, Wcarry_out);
+
+ code.MOV(Wresult, WZR);
+ code.MOV(Wcarry_out, WZR);
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
+ code.UBFIZ(Wcarry_out, Woperand, 29, 1);
+ code.MOV(Wresult, WZR);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg);
+ if (carry_arg.IsImmediate()) {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift);
+ } else {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in);
+ }
+ ctx.reg_alloc.SpillFlags();
+
+ // TODO: Use RMIF
+
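+            // shift == 0: result and carry pass through unchanged.
+            // 0 < shift <= 32: the carry-out is the last bit shifted out, placed at NZCV
+            //                  bit 29; the result becomes zero once the shift reaches 32.
+            // shift > 32: both the result and the carry-out are zero.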
+ oaknut::Label zero, end;
+
+ code.ANDS(Wscratch1, Wshift, 0xff);
+ code.B(EQ, zero);
+
+ code.NEG(Wscratch0, Wshift);
+ code.LSR(Wcarry_out, Woperand, Wscratch0);
+ code.LSL(Wresult, Woperand, Wshift);
+ code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1);
+ code.CMP(Wscratch1, 32);
+ code.CSEL(Wresult, Wresult, WZR, LT);
+ code.CSEL(Wcarry_out, Wcarry_out, WZR, LE);
+ code.B(end);
+
+ code.l(zero);
+ code.MOV(*Wresult, Woperand);
+ if (carry_arg.IsImmediate()) {
+ code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29);
+ } else {
+ code.MOV(*Wcarry_out, Wcarry_in);
+ }
+
+ code.l(end);
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[1].IsImmediate()) {
+ const u8 shift = args[1].GetImmediateU8();
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xresult, Xoperand);
+
+ if (shift <= 63) {
+ code.LSL(Xresult, Xoperand, shift);
+ } else {
+ code.MOV(Xresult, XZR);
+ }
+ } else {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ auto Xshift = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xoperand, Xshift);
+ ctx.reg_alloc.SpillFlags();
+
+ code.AND(Xscratch0, Xshift, 0xff);
+ code.LSL(Xresult, Xoperand, Xscratch0);
+ code.CMP(Xscratch0, 64);
+ code.CSEL(Xresult, Xresult, XZR, LT);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ if (shift <= 31) {
+ code.LSR(Wresult, Woperand, shift);
+ } else {
+ code.MOV(Wresult, WZR);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ RegAlloc::Realize(Wresult, Woperand, Wshift);
+ ctx.reg_alloc.SpillFlags();
+
+ code.AND(Wscratch0, Wshift, 0xff);
+ code.LSR(Wresult, Woperand, Wscratch0);
+ code.CMP(Wscratch0, 32);
+ code.CSEL(Wresult, Wresult, WZR, LT);
+ }
+ } else {
+ if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) {
+ ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg);
+ ctx.reg_alloc.DefineAsExisting(inst, operand_arg);
+ } else if (shift_arg.IsImmediate()) {
+ // TODO: Use RMIF
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift < 32) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
+ code.UBFX(Wcarry_out, Woperand, shift - 1, 1);
+ code.LSL(Wcarry_out, Wcarry_out, 29);
+ code.LSR(Wresult, Woperand, shift);
+ } else if (shift > 32) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wresult, Wcarry_out);
+
+ code.MOV(Wresult, WZR);
+ code.MOV(Wcarry_out, WZR);
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
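+                // Shift of exactly 32: carry-out is bit 31 of the operand, moved into bit 29.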
+ code.LSR(Wcarry_out, Woperand, 31 - 29);
+ code.AND(Wcarry_out, Wcarry_out, 1 << 29);
+ code.MOV(Wresult, WZR);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg);
+ if (carry_arg.IsImmediate()) {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift);
+ } else {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in);
+ }
+ ctx.reg_alloc.SpillFlags();
+
+ // TODO: Use RMIF
+
+ oaknut::Label zero, end;
+
+ code.ANDS(Wscratch1, Wshift, 0xff);
+ code.B(EQ, zero);
+
+ code.SUB(Wscratch0, Wshift, 1);
+ code.LSR(Wcarry_out, Woperand, Wscratch0);
+ code.LSR(Wresult, Woperand, Wshift);
+ code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1);
+ code.CMP(Wscratch1, 32);
+ code.CSEL(Wresult, Wresult, WZR, LT);
+ code.CSEL(Wcarry_out, Wcarry_out, WZR, LE);
+ code.B(end);
+
+ code.l(zero);
+ code.MOV(*Wresult, Woperand);
+ if (carry_arg.IsImmediate()) {
+ code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29);
+ } else {
+ code.MOV(*Wcarry_out, Wcarry_in);
+ }
+
+ code.l(end);
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[1].IsImmediate()) {
+ const u8 shift = args[1].GetImmediateU8();
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ RegAlloc::Realize(Xresult, Xoperand);
+
+ if (shift <= 63) {
+ code.LSR(Xresult, Xoperand, shift);
+ } else {
+ code.MOV(Xresult, XZR);
+ }
+ } else {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(args[0]);
+ auto Xshift = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xoperand, Xshift);
+ ctx.reg_alloc.SpillFlags();
+
+ code.AND(Xscratch0, Xshift, 0xff);
+ code.LSR(Xresult, Xoperand, Xscratch0);
+ code.CMP(Xscratch0, 64);
+ code.CSEL(Xresult, Xresult, XZR, LT);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::ArithmeticShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.ASR(Wresult, Woperand, shift <= 31 ? shift : 31);
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ RegAlloc::Realize(Wresult, Woperand, Wshift);
+ ctx.reg_alloc.SpillFlags();
+
+ code.AND(Wscratch0, Wshift, 0xff);
+ code.MOV(Wscratch1, 31);
+ code.CMP(Wscratch0, 31);
+ code.CSEL(Wscratch0, Wscratch0, Wscratch1, LS);
+ code.ASR(Wresult, Woperand, Wscratch0);
+ }
+ } else {
+ if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) {
+ ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg);
+ ctx.reg_alloc.DefineAsExisting(inst, operand_arg);
+ } else if (shift_arg.IsImmediate()) {
+ // TODO: Use RMIF
+
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift <= 31) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
+ code.UBFX(Wcarry_out, Woperand, shift - 1, 1);
+ code.LSL(Wcarry_out, Wcarry_out, 29);
+ code.ASR(Wresult, Woperand, shift);
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand);
+
+ code.ASR(Wresult, Woperand, 31);
+ code.AND(Wcarry_out, Wresult, 1 << 29);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg);
+ if (carry_arg.IsImmediate()) {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift);
+ } else {
+ RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in);
+ }
+ ctx.reg_alloc.SpillFlags();
+
+ // TODO: Use RMIF
+
+ oaknut::Label zero, end;
+
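+            // Sign-extend to 64 bits and clamp the shift at 63 so any shift >= 32 yields all sign
+            // bits; the carry-out is bit (shift - 1) of the widened value.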
+ code.ANDS(Wscratch0, Wshift, 0xff);
+ code.B(EQ, zero);
+
+ code.MOV(Wscratch1, 63);
+ code.CMP(Wscratch0, 63);
+ code.CSEL(Wscratch0, Wscratch0, Wscratch1, LS);
+
+ code.SXTW(Wresult->toX(), Woperand);
+ code.SUB(Wscratch1, Wscratch0, 1);
+
+ code.ASR(Wcarry_out->toX(), Wresult->toX(), Xscratch1);
+ code.ASR(Wresult->toX(), Wresult->toX(), Xscratch0);
+
+ code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1);
+ code.MOV(*Wresult, Wresult);
+
+ code.B(end);
+
+ code.l(zero);
+ code.MOV(*Wresult, Woperand);
+ if (carry_arg.IsImmediate()) {
+ code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29);
+ } else {
+ code.MOV(*Wcarry_out, Wcarry_in);
+ }
+
+ code.l(end);
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::ArithmeticShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ RegAlloc::Realize(Xresult, Xoperand);
+ code.ASR(Xresult, Xoperand, shift <= 63 ? shift : 63);
+ } else {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ auto Xshift = ctx.reg_alloc.ReadX(shift_arg);
+ RegAlloc::Realize(Xresult, Xoperand, Xshift);
+ code.ASR(Xresult, Xoperand, Xshift);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::RotateRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) {
+ if (carry_inst) {
+ ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg);
+ }
+ ctx.reg_alloc.DefineAsExisting(inst, operand_arg);
+ } else if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8() % 32;
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.ROR(Wresult, Woperand, shift);
+
+ if (carry_inst) {
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wcarry_out);
+
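+            // Carry-out equals bit 31 of the rotated value; this ROR places that bit directly at
+            // bit 29 (the C position) before masking.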
+ code.ROR(Wcarry_out, Woperand, ((shift + 31) - 29) % 32);
+ code.AND(Wcarry_out, Wcarry_out, 1 << 29);
+ }
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ RegAlloc::Realize(Wresult, Woperand, Wshift);
+
+ code.ROR(Wresult, Woperand, Wshift);
+
+ if (carry_inst && carry_arg.IsImmediate()) {
+ const u32 carry_in = carry_arg.GetImmediateU32() << 29;
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wcarry_out);
+ ctx.reg_alloc.SpillFlags();
+
+ code.TST(Wshift, 0xff);
+ code.LSR(Wcarry_out, Wresult, 31 - 29);
+ code.AND(Wcarry_out, Wcarry_out, 1 << 29);
+ if (carry_in) {
+ code.MOV(Wscratch0, carry_in);
+ code.CSEL(Wcarry_out, Wscratch0, Wcarry_out, EQ);
+ } else {
+ code.CSEL(Wcarry_out, WZR, Wcarry_out, EQ);
+ }
+ } else if (carry_inst) {
+ auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg);
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wcarry_out, Wcarry_in);
+ ctx.reg_alloc.SpillFlags();
+
+ code.TST(Wshift, 0xff);
+ code.LSR(Wcarry_out, Wresult, 31 - 29);
+ code.AND(Wcarry_out, Wcarry_out, 1 << 29);
+ code.CSEL(Wcarry_out, Wcarry_in, Wcarry_out, EQ);
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::RotateRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ RegAlloc::Realize(Xresult, Xoperand);
+ code.ROR(Xresult, Xoperand, shift);
+ } else {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ auto Xshift = ctx.reg_alloc.ReadX(shift_arg);
+ RegAlloc::Realize(Xresult, Xoperand, Xshift);
+ code.ROR(Xresult, Xoperand, Xshift);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::RotateRightExtended>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+
+ if (args[1].IsImmediate()) {
+ RegAlloc::Realize(Wresult, Woperand);
+
+ code.LSR(Wresult, Woperand, 1);
+ if (args[1].GetImmediateU1()) {
+ code.ORR(Wresult, Wresult, 0x8000'0000);
+ }
+ } else {
+ auto Wcarry_in = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wresult, Woperand, Wcarry_in);
+
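+        // Move the carry-in from bit 29 down to bit 0, then EXTR concatenates {carry_in:operand}
+        // and shifts right by one, so the carry becomes the new bit 31.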
+ code.LSR(Wscratch0, Wcarry_in, 29);
+ code.EXTR(Wresult, Wscratch0, Woperand, 1);
+ }
+
+ if (carry_inst) {
+ auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst);
+ RegAlloc::Realize(Wcarry_out);
+ code.UBFIZ(Wcarry_out, Woperand, 29, 1);
+ }
+}
+
+template<typename ShiftI, typename ShiftR>
+static void EmitMaskedShift32(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, ShiftI si_fn, ShiftR sr_fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ RegAlloc::Realize(Wresult, Woperand);
+ const u32 shift = shift_arg.GetImmediateU32();
+
+ si_fn(Wresult, Woperand, static_cast<int>(shift & 0x1F));
+ } else {
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(operand_arg);
+ auto Wshift = ctx.reg_alloc.ReadW(shift_arg);
+ RegAlloc::Realize(Wresult, Woperand, Wshift);
+
+ sr_fn(Wresult, Woperand, Wshift);
+ }
+}
+
+template<typename ShiftI, typename ShiftR>
+static void EmitMaskedShift64(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, ShiftI si_fn, ShiftR sr_fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ RegAlloc::Realize(Xresult, Xoperand);
+        const u64 shift = shift_arg.GetImmediateU64();
+
+ si_fn(Xresult, Xoperand, static_cast<int>(shift & 0x3F));
+ } else {
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xoperand = ctx.reg_alloc.ReadX(operand_arg);
+ auto Xshift = ctx.reg_alloc.ReadX(shift_arg);
+ RegAlloc::Realize(Xresult, Xoperand, Xshift);
+
+ sr_fn(Xresult, Xoperand, Xshift);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftLeftMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand, auto shift) { code.LSL(Wresult, Woperand, shift); },
+ [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.LSL(Wresult, Woperand, Wshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftLeftMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand, auto shift) { code.LSL(Xresult, Xoperand, shift); },
+ [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.LSL(Xresult, Xoperand, Xshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand, auto shift) { code.LSR(Wresult, Woperand, shift); },
+ [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.LSR(Wresult, Woperand, Wshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::LogicalShiftRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand, auto shift) { code.LSR(Xresult, Xoperand, shift); },
+ [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.LSR(Xresult, Xoperand, Xshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ArithmeticShiftRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand, auto shift) { code.ASR(Wresult, Woperand, shift); },
+ [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.ASR(Wresult, Woperand, Wshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ArithmeticShiftRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand, auto shift) { code.ASR(Xresult, Xoperand, shift); },
+ [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.ASR(Xresult, Xoperand, Xshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::RotateRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand, auto shift) { code.ROR(Wresult, Woperand, shift); },
+ [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.ROR(Wresult, Woperand, Wshift); });
+}
+
+template<>
+void EmitIR<IR::Opcode::RotateRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand, auto shift) { code.ROR(Xresult, Xoperand, shift); },
+ [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.ROR(Xresult, Xoperand, Xshift); });
+}
+
+template<size_t bitsize, typename EmitFn>
+static void MaybeAddSubImm(oaknut::CodeGenerator& code, u64 imm, EmitFn emit_fn) {
+ static_assert(bitsize == 32 || bitsize == 64);
+ if constexpr (bitsize == 32) {
+ imm = static_cast<u32>(imm);
+ }
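+    // Use the ADD/SUB immediate encoding when the value fits; otherwise materialize it in a scratch register.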
+ if (oaknut::AddSubImm::is_valid(imm)) {
+ emit_fn(imm);
+ } else {
+ code.MOV(Rscratch0<bitsize>(), imm);
+ emit_fn(Rscratch0<bitsize>());
+ }
+}
+
+template<size_t bitsize, bool sub>
+static void EmitAddSub(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp);
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst);
+ auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]);
+
+ if (overflow_inst) {
+ // There is a limited set of circumstances where this is required, so assert for this.
+ ASSERT(!sub);
+ ASSERT(!nzcv_inst);
+ ASSERT(args[2].IsImmediate() && args[2].GetImmediateU1() == false);
+
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ ctx.reg_alloc.SpillFlags();
+ RegAlloc::Realize(Rresult, Ra, Rb, Woverflow);
+
+ code.ADDS(Rresult, *Ra, Rb);
+ code.CSET(Woverflow, VS);
+ } else if (nzcv_inst) {
+ if (args[1].IsImmediate()) {
+ const u64 imm = args[1].GetImmediateU64();
+
+ if (args[2].IsImmediate()) {
+ auto flags = ctx.reg_alloc.WriteFlags(nzcv_inst);
+ RegAlloc::Realize(Rresult, Ra, flags);
+
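+                    // With an immediate carry-in, fold it into the operation:
+                    // a + b + 1 == a - ~b and a - b - 1 == a + ~b, so only ADDS/SUBS are needed.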
+ if (args[2].GetImmediateU1()) {
+ MaybeAddSubImm<bitsize>(code, sub ? imm : ~imm, [&](const auto b) { code.SUBS(Rresult, *Ra, b); });
+ } else {
+ MaybeAddSubImm<bitsize>(code, sub ? ~imm : imm, [&](const auto b) { code.ADDS(Rresult, *Ra, b); });
+ }
+ } else {
+ RegAlloc::Realize(Rresult, Ra);
+ ctx.reg_alloc.ReadWriteFlags(args[2], nzcv_inst);
+
+ if (imm == 0) {
+ if constexpr (bitsize == 32) {
+ sub ? code.SBCS(Rresult, Ra, WZR) : code.ADCS(Rresult, Ra, WZR);
+ } else {
+ sub ? code.SBCS(Rresult, Ra, XZR) : code.ADCS(Rresult, Ra, XZR);
+ }
+ } else {
+ code.MOV(Rscratch0<bitsize>(), imm);
+ sub ? code.SBCS(Rresult, Ra, Rscratch0<bitsize>()) : code.ADCS(Rresult, Ra, Rscratch0<bitsize>());
+ }
+ }
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+
+ if (args[2].IsImmediate()) {
+ auto flags = ctx.reg_alloc.WriteFlags(nzcv_inst);
+ RegAlloc::Realize(Rresult, Ra, Rb, flags);
+
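+                // Same folding as the immediate case, using MVN to form ~b when needed.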
+ if (args[2].GetImmediateU1()) {
+ if (sub) {
+ code.SUBS(Rresult, *Ra, Rb);
+ } else {
+ code.MVN(Rscratch0<bitsize>(), Rb);
+ code.SUBS(Rresult, *Ra, Rscratch0<bitsize>());
+ }
+ } else {
+ if (sub) {
+ code.MVN(Rscratch0<bitsize>(), Rb);
+ code.ADDS(Rresult, *Ra, Rscratch0<bitsize>());
+ } else {
+ code.ADDS(Rresult, *Ra, Rb);
+ }
+ }
+ } else {
+ RegAlloc::Realize(Rresult, Ra, Rb);
+ ctx.reg_alloc.ReadWriteFlags(args[2], nzcv_inst);
+
+ sub ? code.SBCS(Rresult, Ra, Rb) : code.ADCS(Rresult, Ra, Rb);
+ }
+ }
+ } else {
+ if (args[1].IsImmediate()) {
+ const u64 imm = args[1].GetImmediateU64();
+
+ RegAlloc::Realize(Rresult, Ra);
+
+ if (args[2].IsImmediate()) {
+ if (args[2].GetImmediateU1()) {
+ MaybeAddSubImm<bitsize>(code, sub ? imm : ~imm, [&](const auto b) { code.SUB(Rresult, *Ra, b); });
+ } else {
+ MaybeAddSubImm<bitsize>(code, sub ? ~imm : imm, [&](const auto b) { code.ADD(Rresult, *Ra, b); });
+ }
+ } else {
+ ctx.reg_alloc.ReadWriteFlags(args[2], nullptr);
+
+ if (imm == 0) {
+ if constexpr (bitsize == 32) {
+ sub ? code.SBC(Rresult, Ra, WZR) : code.ADC(Rresult, Ra, WZR);
+ } else {
+ sub ? code.SBC(Rresult, Ra, XZR) : code.ADC(Rresult, Ra, XZR);
+ }
+ } else {
+ code.MOV(Rscratch0<bitsize>(), imm);
+ sub ? code.SBC(Rresult, Ra, Rscratch0<bitsize>()) : code.ADC(Rresult, Ra, Rscratch0<bitsize>());
+ }
+ }
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+
+ RegAlloc::Realize(Rresult, Ra, Rb);
+
+ if (args[2].IsImmediate()) {
+ if (args[2].GetImmediateU1()) {
+ if (sub) {
+ code.SUB(Rresult, *Ra, Rb);
+ } else {
+ code.MVN(Rscratch0<bitsize>(), Rb);
+ code.SUB(Rresult, *Ra, Rscratch0<bitsize>());
+ }
+ } else {
+ if (sub) {
+ code.MVN(Rscratch0<bitsize>(), Rb);
+ code.ADD(Rresult, *Ra, Rscratch0<bitsize>());
+ } else {
+ code.ADD(Rresult, *Ra, Rb);
+ }
+ }
+ } else {
+ ctx.reg_alloc.ReadWriteFlags(args[2], nullptr);
+
+ sub ? code.SBC(Rresult, Ra, Rb) : code.ADC(Rresult, Ra, Rb);
+ }
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::Add32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAddSub<32, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::Add64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAddSub<64, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::Sub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAddSub<32, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::Sub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAddSub<64, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::Mul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Wa, auto& Wb) { code.MUL(Wresult, Wa, Wb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Mul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xa, auto& Xb) { code.MUL(Xresult, Xa, Xb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedMultiplyHigh64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xop1 = ctx.reg_alloc.ReadX(args[0]);
+ auto Xop2 = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xop1, Xop2);
+
+ code.SMULH(Xresult, Xop1, Xop2);
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedMultiplyHigh64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xop1 = ctx.reg_alloc.ReadX(args[0]);
+ auto Xop2 = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xop1, Xop2);
+
+ code.UMULH(Xresult, Xop1, Xop2);
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Wa, auto& Wb) { code.UDIV(Wresult, Wa, Wb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xa, auto& Xb) { code.UDIV(Xresult, Xa, Xb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Wa, auto& Wb) { code.SDIV(Wresult, Wa, Wb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xa, auto& Xb) { code.SDIV(Xresult, Xa, Xb); });
+}
+
+template<size_t bitsize>
+static bool IsValidBitImm(u64 imm) {
+ static_assert(bitsize == 32 || bitsize == 64);
+ if constexpr (bitsize == 32) {
+ return static_cast<bool>(oaknut::detail::encode_bit_imm(static_cast<u32>(imm)));
+ } else {
+ return static_cast<bool>(oaknut::detail::encode_bit_imm(imm));
+ }
+}
+
+template<size_t bitsize, typename EmitFn>
+static void MaybeBitImm(oaknut::CodeGenerator& code, u64 imm, EmitFn emit_fn) {
+ static_assert(bitsize == 32 || bitsize == 64);
+ if constexpr (bitsize == 32) {
+ imm = static_cast<u32>(imm);
+ }
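+    // Logical instructions only accept bitmask immediates; fall back to a scratch register otherwise.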
+ if (IsValidBitImm<bitsize>(imm)) {
+ emit_fn(imm);
+ } else {
+ code.MOV(Rscratch0<bitsize>(), imm);
+ emit_fn(Rscratch0<bitsize>());
+ }
+}
+
+template<size_t bitsize, typename EmitFn1, typename EmitFn2 = std::nullptr_t>
+static void EmitBitOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn1 emit_without_flags, EmitFn2 emit_with_flags = nullptr) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst);
+ auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]);
+
+ if constexpr (!std::is_same_v<EmitFn2, std::nullptr_t>) {
+ const auto nz_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZFromOp);
+ const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp);
+ ASSERT(!(nz_inst && nzcv_inst));
+ const auto flag_inst = nz_inst ? nz_inst : nzcv_inst;
+
+ if (flag_inst) {
+ auto Wflags = ctx.reg_alloc.WriteFlags(flag_inst);
+
+ if (args[1].IsImmediate()) {
+ RegAlloc::Realize(Rresult, Ra, Wflags);
+
+ MaybeBitImm<bitsize>(code, args[1].GetImmediateU64(), [&](const auto& b) { emit_with_flags(Rresult, Ra, b); });
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Rresult, Ra, Rb, Wflags);
+
+ emit_with_flags(Rresult, Ra, Rb);
+ }
+
+ return;
+ }
+ }
+
+ if (args[1].IsImmediate()) {
+ RegAlloc::Realize(Rresult, Ra);
+
+ MaybeBitImm<bitsize>(code, args[1].GetImmediateU64(), [&](const auto& b) { emit_without_flags(Rresult, Ra, b); });
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Rresult, Ra, Rb);
+
+ emit_without_flags(Rresult, Ra, Rb);
+ }
+}
+
+template<size_t bitsize>
+static void EmitAndNot(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto nz_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZFromOp);
+ const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp);
+ ASSERT(!(nz_inst && nzcv_inst));
+ const auto flag_inst = nz_inst ? nz_inst : nzcv_inst;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst);
+ auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]);
+
+ if (flag_inst) {
+ auto Wflags = ctx.reg_alloc.WriteFlags(flag_inst);
+
+ if (args[1].IsImmediate()) {
+ RegAlloc::Realize(Rresult, Ra, Wflags);
+
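+            // Prefer ANDS with the complemented immediate when it is encodable; otherwise
+            // materialize the original immediate and use BICS.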
+ const u64 not_imm = bitsize == 32 ? static_cast<u32>(~args[1].GetImmediateU64()) : ~args[1].GetImmediateU64();
+
+ if (IsValidBitImm<bitsize>(not_imm)) {
+ code.ANDS(Rresult, Ra, not_imm);
+ } else {
+ code.MOV(Rscratch0<bitsize>(), args[1].GetImmediateU64());
+ code.BICS(Rresult, Ra, Rscratch0<bitsize>());
+ }
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Rresult, Ra, Rb, Wflags);
+
+ code.BICS(Rresult, Ra, Rb);
+ }
+
+ return;
+ }
+
+ if (args[1].IsImmediate()) {
+ RegAlloc::Realize(Rresult, Ra);
+
+ const u64 not_imm = bitsize == 32 ? static_cast<u32>(~args[1].GetImmediateU64()) : ~args[1].GetImmediateU64();
+
+ if (IsValidBitImm<bitsize>(not_imm)) {
+ code.AND(Rresult, Ra, not_imm);
+ } else {
+ code.MOV(Rscratch0<bitsize>(), args[1].GetImmediateU64());
+ code.BIC(Rresult, Ra, Rscratch0<bitsize>());
+ }
+ } else {
+ auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]);
+ RegAlloc::Realize(Rresult, Ra, Rb);
+
+ code.BIC(Rresult, Ra, Rb);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::And32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<32>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.AND(result, a, b); },
+ [&](auto& result, auto& a, auto& b) { code.ANDS(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::And64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<64>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.AND(result, a, b); },
+ [&](auto& result, auto& a, auto& b) { code.ANDS(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::AndNot32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAndNot<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::AndNot64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitAndNot<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::Eor32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<32>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.EOR(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Eor64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<64>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.EOR(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Or32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<32>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.ORR(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Or64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBitOp<64>(
+ code, ctx, inst,
+ [&](auto& result, auto& a, auto& b) { code.ORR(result, a, b); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Not32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.MVN(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::Not64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.MVN(Xresult, Xoperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignExtendByteToWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.SXTB(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignExtendHalfToWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.SXTH(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignExtendByteToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.SXTB(Xresult, Xoperand->toW()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignExtendHalfToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.SXTH(Xresult, Xoperand->toW()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::SignExtendWordToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.SXTW(Xresult, Xoperand->toW()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendByteToWord>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendHalfToWord>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendByteToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendHalfToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendWordToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroExtendLongToQuad>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Xvalue, Qresult);
+
+ code.FMOV(Qresult->toD(), Xvalue);
+}
+
+template<>
+void EmitIR<IR::Opcode::ByteReverseWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.REV(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ByteReverseHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.REV16(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ByteReverseDual>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.REV(Xresult, Xoperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CountLeadingZeros32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(
+ code, ctx, inst,
+ [&](auto& Wresult, auto& Woperand) { code.CLZ(Wresult, Woperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::CountLeadingZeros64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(
+ code, ctx, inst,
+ [&](auto& Xresult, auto& Xoperand) { code.CLZ(Xresult, Xoperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ExtractRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[2].IsImmediate());
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wop1 = ctx.reg_alloc.ReadW(args[0]);
+ auto Wop2 = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wresult, Wop1, Wop2);
+ const u8 lsb = args[2].GetImmediateU8();
+
+ code.EXTR(Wresult, Wop2, Wop1, lsb); // NB: flipped
+}
+
+template<>
+void EmitIR<IR::Opcode::ExtractRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[2].IsImmediate());
+
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xop1 = ctx.reg_alloc.ReadX(args[0]);
+ auto Xop2 = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xop1, Xop2);
+ const u8 lsb = args[2].GetImmediateU8();
+
+ code.EXTR(Xresult, Xop2, Xop1, lsb); // NB: flipped
+}
+
+template<>
+void EmitIR<IR::Opcode::ReplicateBit32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wvalue = ctx.reg_alloc.ReadW(args[0]);
+ const u8 bit = args[1].GetImmediateU8();
+ RegAlloc::Realize(Wresult, Wvalue);
+
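+    // Shift the selected bit into the sign position, then arithmetic-shift it back to fill the register.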
+ code.LSL(Wresult, Wvalue, 31 - bit);
+ code.ASR(Wresult, Wresult, 31);
+}
+
+template<>
+void EmitIR<IR::Opcode::ReplicateBit64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xvalue = ctx.reg_alloc.ReadX(args[0]);
+ const u8 bit = args[1].GetImmediateU8();
+ RegAlloc::Realize(Xresult, Xvalue);
+
+ code.LSL(Xresult, Xvalue, 63 - bit);
+ code.ASR(Xresult, Xresult, 63);
+}
+
+static void EmitMaxMin32(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Cond cond) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wop1 = ctx.reg_alloc.ReadW(args[0]);
+ auto Wop2 = ctx.reg_alloc.ReadW(args[1]);
+ RegAlloc::Realize(Wresult, Wop1, Wop2);
+ ctx.reg_alloc.SpillFlags();
+
+ code.CMP(Wop1->toW(), Wop2);
+ code.CSEL(Wresult, Wop1, Wop2, cond);
+}
+
+static void EmitMaxMin64(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Cond cond) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Xresult = ctx.reg_alloc.WriteX(inst);
+ auto Xop1 = ctx.reg_alloc.ReadX(args[0]);
+ auto Xop2 = ctx.reg_alloc.ReadX(args[1]);
+ RegAlloc::Realize(Xresult, Xop1, Xop2);
+ ctx.reg_alloc.SpillFlags();
+
+ code.CMP(Xop1->toX(), Xop2);
+ code.CSEL(Xresult, Xop1, Xop2, cond);
+}
+
+template<>
+void EmitIR<IR::Opcode::MaxSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin32(code, ctx, inst, GT);
+}
+
+template<>
+void EmitIR<IR::Opcode::MaxSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin64(code, ctx, inst, GT);
+}
+
+template<>
+void EmitIR<IR::Opcode::MaxUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin32(code, ctx, inst, HI);
+}
+
+template<>
+void EmitIR<IR::Opcode::MaxUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin64(code, ctx, inst, HI);
+}
+
+template<>
+void EmitIR<IR::Opcode::MinSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin32(code, ctx, inst, LT);
+}
+
+template<>
+void EmitIR<IR::Opcode::MinSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin64(code, ctx, inst, LT);
+}
+
+template<>
+void EmitIR<IR::Opcode::MinUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin32(code, ctx, inst, LO);
+}
+
+template<>
+void EmitIR<IR::Opcode::MinUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitMaxMin64(code, ctx, inst, LO);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp
new file mode 100644
index 0000000000..22ec0ec24a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp
@@ -0,0 +1,801 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<size_t bitsize, typename EmitFn>
+static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst);
+ auto Voperand = ctx.reg_alloc.ReadVec<bitsize>(args[0]);
+ RegAlloc::Realize(Vresult, Voperand);
+ ctx.fpsr.Load();
+
+ emit(Vresult, Voperand);
+}
+
+template<size_t bitsize, typename EmitFn>
+static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst);
+ auto Va = ctx.reg_alloc.ReadVec<bitsize>(args[0]);
+ auto Vb = ctx.reg_alloc.ReadVec<bitsize>(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+ ctx.fpsr.Load();
+
+ emit(Vresult, Va, Vb);
+}
+
+template<size_t bitsize, typename EmitFn>
+static void EmitFourOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst);
+ auto Va = ctx.reg_alloc.ReadVec<bitsize>(args[0]);
+ auto Vb = ctx.reg_alloc.ReadVec<bitsize>(args[1]);
+ auto Vc = ctx.reg_alloc.ReadVec<bitsize>(args[2]);
+ RegAlloc::Realize(Vresult, Va, Vb, Vc);
+ ctx.fpsr.Load();
+
+ emit(Vresult, Va, Vb, Vc);
+}
+
+template<size_t bitsize_from, size_t bitsize_to, typename EmitFn>
+static void EmitConvert(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vto = ctx.reg_alloc.WriteVec<bitsize_to>(inst);
+ auto Vfrom = ctx.reg_alloc.ReadVec<bitsize_from>(args[0]);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+ RegAlloc::Realize(Vto, Vfrom);
+ ctx.fpsr.Load();
+
+ ASSERT(rounding_mode == ctx.FPCR().RMode());
+
+ emit(Vto, Vfrom);
+}
+
+template<size_t bitsize_from, size_t bitsize_to, bool is_signed>
+static void EmitToFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Rto = ctx.reg_alloc.WriteReg<std::max<size_t>(bitsize_to, 32)>(inst);
+ auto Vfrom = ctx.reg_alloc.ReadVec<bitsize_from>(args[0]);
+ const size_t fbits = args[1].GetImmediateU8();
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ RegAlloc::Realize(Rto, Vfrom);
+ ctx.fpsr.Load();
+
+ if (rounding_mode == FP::RoundingMode::TowardsZero) {
+ if constexpr (is_signed) {
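+            // 16-bit destination: convert with 16 extra fraction bits, then shift them back out.
+            // The ASR/ADD pair below makes the final right shift truncate towards zero for negatives.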
+ if constexpr (bitsize_to == 16) {
+ code.FCVTZS(Rto, Vfrom, fbits + 16);
+ code.ASR(Wscratch0, Rto, 31);
+ code.ADD(Rto, Rto, Wscratch0, LSR, 16); // Round towards zero when truncating
+ code.LSR(Rto, Rto, 16);
+ } else if (fbits) {
+ code.FCVTZS(Rto, Vfrom, fbits);
+ } else {
+ code.FCVTZS(Rto, Vfrom);
+ }
+ } else {
+ if constexpr (bitsize_to == 16) {
+ code.FCVTZU(Rto, Vfrom, fbits + 16);
+ code.LSR(Rto, Rto, 16);
+ } else if (fbits) {
+ code.FCVTZU(Rto, Vfrom, fbits);
+ } else {
+ code.FCVTZU(Rto, Vfrom);
+ }
+ }
+ } else {
+ ASSERT(fbits == 0);
+ ASSERT(bitsize_to != 16);
+ if constexpr (is_signed) {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FCVTNS(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FCVTPS(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FCVTMS(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FCVTZS(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FCVTAS(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::ToOdd:
+ ASSERT_FALSE("Unimplemented");
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ break;
+ }
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FCVTNU(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FCVTPU(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FCVTMU(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FCVTZU(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FCVTAU(Rto, Vfrom);
+ break;
+ case FP::RoundingMode::ToOdd:
+ ASSERT_FALSE("Unimplemented");
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ break;
+ }
+ }
+ }
+}
+
+template<size_t bitsize_from, size_t bitsize_to, typename EmitFn>
+static void EmitFromFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vto = ctx.reg_alloc.WriteVec<bitsize_to>(inst);
+ auto Rfrom = ctx.reg_alloc.ReadReg<std::max<size_t>(bitsize_from, 32)>(args[0]);
+ const size_t fbits = args[1].GetImmediateU8();
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ RegAlloc::Realize(Vto, Rfrom);
+ ctx.fpsr.Load();
+
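+    // If the requested rounding mode differs from the current FPCR mode, temporarily switch
+    // FPCR around the conversion and restore it afterwards.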
+ if (rounding_mode == ctx.FPCR().RMode()) {
+ emit(Vto, Rfrom, fbits);
+ } else {
+ FP::FPCR new_fpcr = ctx.FPCR();
+ new_fpcr.RMode(rounding_mode);
+
+ code.MOV(Wscratch0, new_fpcr.Value());
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+
+ emit(Vto, Rfrom, fbits);
+
+ code.MOV(Wscratch0, ctx.FPCR().Value());
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::FPAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FABS(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FABS(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FADD(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FADD(Dresult, Da, Db); });
+}
+
+template<size_t size>
+void EmitCompare(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto flags = ctx.reg_alloc.WriteFlags(inst);
+ auto Va = ctx.reg_alloc.ReadVec<size>(args[0]);
+ const bool exc_on_qnan = args[2].GetImmediateU1();
+
+ if (args[1].IsImmediate() && args[1].GetImmediateU64() == 0) {
+ RegAlloc::Realize(flags, Va);
+ ctx.fpsr.Load();
+
+ if (exc_on_qnan) {
+ code.FCMPE(Va, 0);
+ } else {
+ code.FCMP(Va, 0);
+ }
+ } else {
+ auto Vb = ctx.reg_alloc.ReadVec<size>(args[1]);
+ RegAlloc::Realize(flags, Va, Vb);
+ ctx.fpsr.Load();
+
+ if (exc_on_qnan) {
+ code.FCMPE(Va, Vb);
+ } else {
+ code.FCMP(Va, Vb);
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::FPCompare32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCompare<32>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPCompare64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitCompare<64>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FDIV(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FDIV(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMax32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMAX(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMax64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMAX(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMAXNM(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMAXNM(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMIN(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMin64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMIN(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMINNM(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMINNM(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMUL(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMUL(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMADD(Sresult, S1, S2, Sa); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMADD(Dresult, D1, D2, Da); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMSUB(Sresult, S1, S2, Sa); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMSUB(Dresult, D1, D2, Da); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMULX(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPMulX64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMULX(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPNeg16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FNEG(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FNEG(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRECPE(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRECPE(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipExponent16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipExponent32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRECPX(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipExponent64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRECPX(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FRECPS(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRecipStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FRECPS(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRoundInt16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRoundInt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Sresult = ctx.reg_alloc.WriteS(inst);
+ auto Soperand = ctx.reg_alloc.ReadS(args[0]);
+ RegAlloc::Realize(Sresult, Soperand);
+ ctx.fpsr.Load();
+
+ if (exact) {
+ ASSERT(ctx.FPCR().RMode() == rounding_mode);
+ code.FRINTX(Sresult, Soperand);
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FRINTN(Sresult, Soperand);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FRINTP(Sresult, Soperand);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FRINTM(Sresult, Soperand);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FRINTZ(Sresult, Soperand);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FRINTA(Sresult, Soperand);
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRoundInt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ auto Doperand = ctx.reg_alloc.ReadD(args[0]);
+ RegAlloc::Realize(Dresult, Doperand);
+ ctx.fpsr.Load();
+
+ if (exact) {
+ ASSERT(ctx.FPCR().RMode() == rounding_mode);
+ code.FRINTX(Dresult, Doperand);
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FRINTN(Dresult, Doperand);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FRINTP(Dresult, Doperand);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FRINTM(Dresult, Doperand);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FRINTZ(Dresult, Doperand);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FRINTA(Dresult, Doperand);
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ }
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRSQRTE(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRSQRTE(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FRSQRTS(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPRSqrtStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FRSQRTS(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSqrt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FSQRT(Sresult, Soperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSqrt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FSQRT(Dresult, Doperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FSUB(Sresult, Sa, Sb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FSUB(Dresult, Da, Db); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitConvert<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Hfrom) { code.FCVT(Dto, Hfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitConvert<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Hfrom) { code.FCVT(Sto, Hfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitConvert<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Sfrom) { code.FCVT(Dto, Sfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitConvert<32, 16>(code, ctx, inst, [&](auto& Hto, auto& Sfrom) { code.FCVT(Hto, Sfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitConvert<64, 16>(code, ctx, inst, [&](auto& Hto, auto& Dfrom) { code.FCVT(Hto, Dfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+
+ if (rounding_mode == FP::RoundingMode::ToOdd) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Sto = ctx.reg_alloc.WriteS(inst);
+ auto Dfrom = ctx.reg_alloc.ReadD(args[0]);
+ RegAlloc::Realize(Sto, Dfrom);
+ ctx.fpsr.Load();
+
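+        // FCVTXN narrows double to single with round-to-odd; FCVT has no such rounding mode.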
+ code.FCVTXN(Sto, Dfrom);
+
+ return;
+ }
+
+ EmitConvert<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Dfrom) { code.FCVT(Sto, Dfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, 16, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, 32, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitToFixed<64, 64, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, 16, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, 32, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPDoubleToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitToFixed<64, 64, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPHalfToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, 16, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitToFixed<32, 32, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, 64, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, 16, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitToFixed<32, 32, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPSingleToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, 64, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU16ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) {
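+        // There is no 16-bit fixed-point convert: shift the value into the top half of the
+        // register and compensate by treating it as having 16 extra fraction bits.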
+ code.LSL(Wscratch0, Wfrom, 16);
+ code.UCVTF(Sto, Wscratch0, fbits + 16);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS16ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) {
+ code.LSL(Wscratch0, Wfrom, 16);
+ code.SCVTF(Sto, Wscratch0, fbits + 16);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU16ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) {
+ code.LSL(Wscratch0, Wfrom, 16);
+ code.UCVTF(Dto, Wscratch0, fbits + 16);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS16ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) {
+ code.LSL(Wscratch0, Wfrom, 16);
+ code.SCVTF(Dto, Wscratch0, fbits + 16);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU32ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitFromFixed<32, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { fbits ? code.UCVTF(Sto, Wfrom, fbits) : code.UCVTF(Sto, Wfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS32ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitFromFixed<32, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { fbits ? code.SCVTF(Sto, Wfrom, fbits) : code.SCVTF(Sto, Wfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU32ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { fbits ? code.UCVTF(Dto, Wfrom, fbits) : code.UCVTF(Dto, Wfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS32ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { fbits ? code.SCVTF(Dto, Wfrom, fbits) : code.SCVTF(Dto, Wfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU64ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitFromFixed<64, 64>(code, ctx, inst, [&](auto& Dto, auto& Xfrom, u8 fbits) { fbits ? code.UCVTF(Dto, Xfrom, fbits) : code.UCVTF(Dto, Xfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedU64ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Xfrom, u8 fbits) { fbits ? code.UCVTF(Sto, Xfrom, fbits) : code.UCVTF(Sto, Xfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS64ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ // TODO: Consider fpr source
+ EmitFromFixed<64, 64>(code, ctx, inst, [&](auto& Dto, auto& Xfrom, u8 fbits) { fbits ? code.SCVTF(Dto, Xfrom, fbits) : code.SCVTF(Dto, Xfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPFixedS64ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Xfrom, u8 fbits) { fbits ? code.SCVTF(Sto, Xfrom, fbits) : code.SCVTF(Sto, Xfrom); });
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp
new file mode 100644
index 0000000000..339e2c59a0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp
@@ -0,0 +1,683 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/emit_arm64_memory.h"
+
+#include <optional>
+#include <utility>
+
+#include <mcl/bit_cast.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fastmem.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/acc_type.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+namespace {
+
+bool IsOrdered(IR::AccType acctype) {
+ return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED;
+}
+
+LinkTarget ReadMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::ReadMemory8;
+ case 16:
+ return LinkTarget::ReadMemory16;
+ case 32:
+ return LinkTarget::ReadMemory32;
+ case 64:
+ return LinkTarget::ReadMemory64;
+ case 128:
+ return LinkTarget::ReadMemory128;
+ }
+ UNREACHABLE();
+}
+
+LinkTarget WriteMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::WriteMemory8;
+ case 16:
+ return LinkTarget::WriteMemory16;
+ case 32:
+ return LinkTarget::WriteMemory32;
+ case 64:
+ return LinkTarget::WriteMemory64;
+ case 128:
+ return LinkTarget::WriteMemory128;
+ }
+ UNREACHABLE();
+}
+
+LinkTarget WrappedReadMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::WrappedReadMemory8;
+ case 16:
+ return LinkTarget::WrappedReadMemory16;
+ case 32:
+ return LinkTarget::WrappedReadMemory32;
+ case 64:
+ return LinkTarget::WrappedReadMemory64;
+ case 128:
+ return LinkTarget::WrappedReadMemory128;
+ }
+ UNREACHABLE();
+}
+
+LinkTarget WrappedWriteMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::WrappedWriteMemory8;
+ case 16:
+ return LinkTarget::WrappedWriteMemory16;
+ case 32:
+ return LinkTarget::WrappedWriteMemory32;
+ case 64:
+ return LinkTarget::WrappedWriteMemory64;
+ case 128:
+ return LinkTarget::WrappedWriteMemory128;
+ }
+ UNREACHABLE();
+}
+
+LinkTarget ExclusiveReadMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::ExclusiveReadMemory8;
+ case 16:
+ return LinkTarget::ExclusiveReadMemory16;
+ case 32:
+ return LinkTarget::ExclusiveReadMemory32;
+ case 64:
+ return LinkTarget::ExclusiveReadMemory64;
+ case 128:
+ return LinkTarget::ExclusiveReadMemory128;
+ }
+ UNREACHABLE();
+}
+
+LinkTarget ExclusiveWriteMemoryLinkTarget(size_t bitsize) {
+ switch (bitsize) {
+ case 8:
+ return LinkTarget::ExclusiveWriteMemory8;
+ case 16:
+ return LinkTarget::ExclusiveWriteMemory16;
+ case 32:
+ return LinkTarget::ExclusiveWriteMemory32;
+ case 64:
+ return LinkTarget::ExclusiveWriteMemory64;
+ case 128:
+ return LinkTarget::ExclusiveWriteMemory128;
+ }
+ UNREACHABLE();
+}
+
+template<size_t bitsize>
+void CallbackOnlyEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[1]);
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+
+ EmitRelocation(code, ctx, ReadMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+
+ if constexpr (bitsize == 128) {
+ code.MOV(Q8.B16(), Q0.B16());
+ ctx.reg_alloc.DefineAsRegister(inst, Q8);
+ } else {
+ ctx.reg_alloc.DefineAsRegister(inst, X0);
+ }
+}
+
+template<size_t bitsize>
+void CallbackOnlyEmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[1]);
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+
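+    // Mark the exclusive state as active before invoking the read callback.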
+ code.MOV(Wscratch0, 1);
+ code.STRB(Wscratch0, Xstate, ctx.conf.state_exclusive_state_offset);
+ EmitRelocation(code, ctx, ExclusiveReadMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+
+ if constexpr (bitsize == 128) {
+ code.MOV(Q8.B16(), Q0.B16());
+ ctx.reg_alloc.DefineAsRegister(inst, Q8);
+ } else {
+ ctx.reg_alloc.DefineAsRegister(inst, X0);
+ }
+}
+
+template<size_t bitsize>
+void CallbackOnlyEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[1], args[2]);
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ EmitRelocation(code, ctx, WriteMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+}
+
+template<size_t bitsize>
+void CallbackOnlyEmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.PrepareForCall({}, args[1], args[2]);
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+
+ oaknut::Label end;
+
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
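+    // Default result is 1 (store failed); the callback only runs, overwriting the result in X0,
+    // if the exclusive state is still set, and the state is cleared before the attempt.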
+ code.MOV(W0, 1);
+ code.LDRB(Wscratch0, Xstate, ctx.conf.state_exclusive_state_offset);
+ code.CBZ(Wscratch0, end);
+ code.STRB(WZR, Xstate, ctx.conf.state_exclusive_state_offset);
+ EmitRelocation(code, ctx, ExclusiveWriteMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ code.l(end);
+ ctx.reg_alloc.DefineAsRegister(inst, X0);
+}
+
+constexpr size_t page_bits = 12;
+constexpr size_t page_size = 1 << page_bits;
+constexpr size_t page_mask = (1 << page_bits) - 1;
+
+// This function may use Xscratch0 as a scratch register
+// Trashes NZCV
+template<size_t bitsize>
+void EmitDetectMisalignedVAddr(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) {
+ static_assert(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64 || bitsize == 128);
+
+ if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) {
+ return;
+ }
+
+ if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
+ const u64 align_mask = []() -> u64 {
+ switch (bitsize) {
+ case 16:
+ return 0b1;
+ case 32:
+ return 0b11;
+ case 64:
+ return 0b111;
+ case 128:
+ return 0b1111;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ code.TST(Xaddr, align_mask);
+ code.B(NE, *fallback);
+ } else {
+ // If (addr & page_mask) > page_size - byte_size, use fallback.
+ code.AND(Xscratch0, Xaddr, page_mask);
+ code.CMP(Xscratch0, page_size - bitsize / 8);
+ code.B(HI, *fallback);
+ }
+}
+
+// Outputs Xscratch0 = page_table[addr >> page_bits]
+// May use Xscratch1 as scratch register
+// Address to read/write = [ret0 + ret1], ret0 is always Xscratch0 and ret1 is either Xaddr or Xscratch1
+// Trashes NZCV
+template<size_t bitsize>
+std::pair<oaknut::XReg, oaknut::XReg> InlinePageTableEmitVAddrLookup(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) {
+ const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits;
+ const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits;
+
+ EmitDetectMisalignedVAddr<bitsize>(code, ctx, Xaddr, fallback);
+
+ if (ctx.conf.silently_mirror_page_table || unused_top_bits == 0) {
+ code.UBFX(Xscratch0, Xaddr, page_bits, valid_page_index_bits);
+ } else {
+ code.LSR(Xscratch0, Xaddr, page_bits);
+ code.TST(Xscratch0, u64(~u64(0)) << valid_page_index_bits);
+ code.B(NE, *fallback);
+ }
+
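+    // Each page table entry is an 8-byte host pointer, hence the LSL #3 index.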
+ code.LDR(Xscratch0, Xpagetable, Xscratch0, LSL, 3);
+
+ if (ctx.conf.page_table_pointer_mask_bits != 0) {
+ const u64 mask = u64(~u64(0)) << ctx.conf.page_table_pointer_mask_bits;
+ code.AND(Xscratch0, Xscratch0, mask);
+ }
+
+ code.CBZ(Xscratch0, *fallback);
+
+ if (ctx.conf.absolute_offset_page_table) {
+ return std::make_pair(Xscratch0, Xaddr);
+ }
+ code.AND(Xscratch1, Xaddr, page_mask);
+ return std::make_pair(Xscratch0, Xscratch1);
+}
+
+template<std::size_t bitsize>
+CodePtr EmitMemoryLdr(oaknut::CodeGenerator& code, int value_idx, oaknut::XReg Xbase, oaknut::XReg Xoffset, bool ordered, bool extend32 = false) {
+ const auto index_ext = extend32 ? oaknut::IndexExt::UXTW : oaknut::IndexExt::LSL;
+ const auto add_ext = extend32 ? oaknut::AddSubExt::UXTW : oaknut::AddSubExt::LSL;
+ const auto Roffset = extend32 ? oaknut::RReg{Xoffset.toW()} : oaknut::RReg{Xoffset};
+
+ CodePtr fastmem_location = code.xptr<CodePtr>();
+
+ if (ordered) {
+ code.ADD(Xscratch0, Xbase, Roffset, add_ext);
+
+ fastmem_location = code.xptr<CodePtr>();
+
+ switch (bitsize) {
+ case 8:
+ code.LDARB(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 16:
+ code.LDARH(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 32:
+ code.LDAR(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 64:
+ code.LDAR(oaknut::XReg{value_idx}, Xscratch0);
+ break;
+ case 128:
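+            // There is no 128-bit LDAR; approximate acquire semantics with LDR plus a barrier.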
+ code.LDR(oaknut::QReg{value_idx}, Xscratch0);
+ code.DMB(oaknut::BarrierOp::ISH);
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ } else {
+ fastmem_location = code.xptr<CodePtr>();
+
+ switch (bitsize) {
+ case 8:
+ code.LDRB(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 16:
+ code.LDRH(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 32:
+ code.LDR(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 64:
+ code.LDR(oaknut::XReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 128:
+ code.LDR(oaknut::QReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ }
+
+ return fastmem_location;
+}
+
+template<std::size_t bitsize>
+CodePtr EmitMemoryStr(oaknut::CodeGenerator& code, int value_idx, oaknut::XReg Xbase, oaknut::XReg Xoffset, bool ordered, bool extend32 = false) {
+ const auto index_ext = extend32 ? oaknut::IndexExt::UXTW : oaknut::IndexExt::LSL;
+ const auto add_ext = extend32 ? oaknut::AddSubExt::UXTW : oaknut::AddSubExt::LSL;
+ const auto Roffset = extend32 ? oaknut::RReg{Xoffset.toW()} : oaknut::RReg{Xoffset};
+
+ CodePtr fastmem_location;
+
+ if (ordered) {
+ code.ADD(Xscratch0, Xbase, Roffset, add_ext);
+
+ fastmem_location = code.xptr<CodePtr>();
+
+ switch (bitsize) {
+ case 8:
+ code.STLRB(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 16:
+ code.STLRH(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 32:
+ code.STLR(oaknut::WReg{value_idx}, Xscratch0);
+ break;
+ case 64:
+ code.STLR(oaknut::XReg{value_idx}, Xscratch0);
+ break;
+ case 128:
+ code.DMB(oaknut::BarrierOp::ISH);
+ code.STR(oaknut::QReg{value_idx}, Xscratch0);
+ code.DMB(oaknut::BarrierOp::ISH);
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ } else {
+ fastmem_location = code.xptr<CodePtr>();
+
+ switch (bitsize) {
+ case 8:
+ code.STRB(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 16:
+ code.STRH(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 32:
+ code.STR(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 64:
+ code.STR(oaknut::XReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ case 128:
+ code.STR(oaknut::QReg{value_idx}, Xbase, Roffset, index_ext);
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ }
+
+ return fastmem_location;
+}
+
+template<size_t bitsize>
+void InlinePageTableEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xaddr = ctx.reg_alloc.ReadX(args[1]);
+ auto Rvalue = [&] {
+ if constexpr (bitsize == 128) {
+ return ctx.reg_alloc.WriteQ(inst);
+ } else {
+ return ctx.reg_alloc.WriteReg<std::max<std::size_t>(bitsize, 32)>(inst);
+ }
+ }();
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+ ctx.fpsr.Spill();
+ ctx.reg_alloc.SpillFlags();
+ RegAlloc::Realize(Xaddr, Rvalue);
+
+ SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+
+ const auto [Xbase, Xoffset] = InlinePageTableEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback);
+ EmitMemoryLdr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered);
+
+ ctx.deferred_emits.emplace_back([&code, &ctx, inst, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end] {
+ code.l(*fallback);
+ code.MOV(Xscratch0, Xaddr);
+ EmitRelocation(code, ctx, WrappedReadMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ if constexpr (bitsize == 128) {
+ code.MOV(Rvalue.B16(), Q0.B16());
+ } else {
+ code.MOV(Rvalue.toX(), Xscratch0);
+ }
+ ctx.conf.emit_check_memory_abort(code, ctx, inst, *end);
+ code.B(*end);
+ });
+
+ code.l(*end);
+}
+
+template<size_t bitsize>
+void InlinePageTableEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xaddr = ctx.reg_alloc.ReadX(args[1]);
+ auto Rvalue = [&] {
+ if constexpr (bitsize == 128) {
+ return ctx.reg_alloc.ReadQ(args[2]);
+ } else {
+ return ctx.reg_alloc.ReadReg<std::max<std::size_t>(bitsize, 32)>(args[2]);
+ }
+ }();
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+ ctx.fpsr.Spill();
+ ctx.reg_alloc.SpillFlags();
+ RegAlloc::Realize(Xaddr, Rvalue);
+
+ SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+
+ const auto [Xbase, Xoffset] = InlinePageTableEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback);
+ EmitMemoryStr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered);
+
+ ctx.deferred_emits.emplace_back([&code, &ctx, inst, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end] {
+ code.l(*fallback);
+ if constexpr (bitsize == 128) {
+ code.MOV(Xscratch0, Xaddr);
+ code.MOV(Q0.B16(), Rvalue.B16());
+ } else {
+ code.MOV(Xscratch0, Xaddr);
+ code.MOV(Xscratch1, Rvalue.toX());
+ }
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ EmitRelocation(code, ctx, WrappedWriteMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ ctx.conf.emit_check_memory_abort(code, ctx, inst, *end);
+ code.B(*end);
+ });
+
+ code.l(*end);
+}
+
+std::optional<DoNotFastmemMarker> ShouldFastmem(EmitContext& ctx, IR::Inst* inst) {
+ if (!ctx.conf.fastmem_pointer || !ctx.fastmem.SupportsFastmem()) {
+ return std::nullopt;
+ }
+
+ const auto marker = std::make_tuple(ctx.block.Location(), inst->GetName());
+ if (ctx.fastmem.ShouldFastmem(marker)) {
+ return marker;
+ }
+ return std::nullopt;
+}
+
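+// With a 32-bit, silently-mirrored fastmem arena the guest address can be used directly as an
+// unsigned 32-bit offset (UXTW addressing) without any masking.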
+inline bool ShouldExt32(EmitContext& ctx) {
+ return ctx.conf.fastmem_address_space_bits == 32 && ctx.conf.silently_mirror_fastmem;
+}
+
+// May use Xscratch0 as scratch register
+// Address to read/write = [ret0 + ret1], ret0 is always Xfastmem and ret1 is either Xaddr or Xscratch0
+// Trashes NZCV
+template<size_t bitsize>
+std::pair<oaknut::XReg, oaknut::XReg> FastmemEmitVAddrLookup(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) {
+ if (ctx.conf.fastmem_address_space_bits == 64 || ShouldExt32(ctx)) {
+ return std::make_pair(Xfastmem, Xaddr);
+ }
+
+ if (ctx.conf.silently_mirror_fastmem) {
+ code.UBFX(Xscratch0, Xaddr, 0, ctx.conf.fastmem_address_space_bits);
+ return std::make_pair(Xfastmem, Xscratch0);
+ }
+
+ code.LSR(Xscratch0, Xaddr, ctx.conf.fastmem_address_space_bits);
+ code.CBNZ(Xscratch0, *fallback);
+ return std::make_pair(Xfastmem, Xaddr);
+}
+
+template<size_t bitsize>
+void FastmemEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, DoNotFastmemMarker marker) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xaddr = ctx.reg_alloc.ReadX(args[1]);
+ auto Rvalue = [&] {
+ if constexpr (bitsize == 128) {
+ return ctx.reg_alloc.WriteQ(inst);
+ } else {
+ return ctx.reg_alloc.WriteReg<std::max<std::size_t>(bitsize, 32)>(inst);
+ }
+ }();
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+ ctx.fpsr.Spill();
+ ctx.reg_alloc.SpillFlags();
+ RegAlloc::Realize(Xaddr, Rvalue);
+
+ SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+
+ const auto [Xbase, Xoffset] = FastmemEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback);
+ const auto fastmem_location = EmitMemoryLdr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered, ShouldExt32(ctx));
+
+ ctx.deferred_emits.emplace_back([&code, &ctx, inst, marker, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end, fastmem_location] {
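+        // Record the patchable load location so a fault on it can be redirected to the fallback.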
+ ctx.ebi.fastmem_patch_info.emplace(
+ fastmem_location - ctx.ebi.entry_point,
+ FastmemPatchInfo{
+ .marker = marker,
+ .fc = FakeCall{
+ .call_pc = mcl::bit_cast<u64>(code.xptr<void*>()),
+ },
+ .recompile = ctx.conf.recompile_on_fastmem_failure,
+ });
+
+ code.l(*fallback);
+ code.MOV(Xscratch0, Xaddr);
+ EmitRelocation(code, ctx, WrappedReadMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ if constexpr (bitsize == 128) {
+ code.MOV(Rvalue.B16(), Q0.B16());
+ } else {
+ code.MOV(Rvalue.toX(), Xscratch0);
+ }
+ ctx.conf.emit_check_memory_abort(code, ctx, inst, *end);
+ code.B(*end);
+ });
+
+ code.l(*end);
+}
+
+template<size_t bitsize>
+void FastmemEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, DoNotFastmemMarker marker) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Xaddr = ctx.reg_alloc.ReadX(args[1]);
+ auto Rvalue = [&] {
+ if constexpr (bitsize == 128) {
+ return ctx.reg_alloc.ReadQ(args[2]);
+ } else {
+ return ctx.reg_alloc.ReadReg<std::max<std::size_t>(bitsize, 32)>(args[2]);
+ }
+ }();
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+ ctx.fpsr.Spill();
+ ctx.reg_alloc.SpillFlags();
+ RegAlloc::Realize(Xaddr, Rvalue);
+
+ SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+
+ const auto [Xbase, Xoffset] = FastmemEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback);
+ const auto fastmem_location = EmitMemoryStr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered, ShouldExt32(ctx));
+
+ ctx.deferred_emits.emplace_back([&code, &ctx, inst, marker, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end, fastmem_location] {
+ ctx.ebi.fastmem_patch_info.emplace(
+ fastmem_location - ctx.ebi.entry_point,
+ FastmemPatchInfo{
+ .marker = marker,
+ .fc = FakeCall{
+ .call_pc = mcl::bit_cast<u64>(code.xptr<void*>()),
+ },
+ .recompile = ctx.conf.recompile_on_fastmem_failure,
+ });
+
+ code.l(*fallback);
+ if constexpr (bitsize == 128) {
+ code.MOV(Xscratch0, Xaddr);
+ code.MOV(Q0.B16(), Rvalue.B16());
+ } else {
+ code.MOV(Xscratch0, Xaddr);
+ code.MOV(Xscratch1, Rvalue.toX());
+ }
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ EmitRelocation(code, ctx, WrappedWriteMemoryLinkTarget(bitsize));
+ if (ordered) {
+ code.DMB(oaknut::BarrierOp::ISH);
+ }
+ ctx.conf.emit_check_memory_abort(code, ctx, inst, *end);
+ code.B(*end);
+ });
+
+ code.l(*end);
+}
+
+} // namespace
+
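+// Dispatch order: fastmem when enabled and not marked do-not-fastmem for this location, then the
+// inline page table, and finally the memory callbacks.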
+template<size_t bitsize>
+void EmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ if (const auto marker = ShouldFastmem(ctx, inst)) {
+ FastmemEmitReadMemory<bitsize>(code, ctx, inst, *marker);
+ } else if (ctx.conf.page_table_pointer != 0) {
+ InlinePageTableEmitReadMemory<bitsize>(code, ctx, inst);
+ } else {
+ CallbackOnlyEmitReadMemory<bitsize>(code, ctx, inst);
+ }
+}
+
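+// Exclusive accesses always take the callback path; no fastmem or page-table variant is emitted.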
+template<size_t bitsize>
+void EmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ CallbackOnlyEmitExclusiveReadMemory<bitsize>(code, ctx, inst);
+}
+
+template<size_t bitsize>
+void EmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ if (const auto marker = ShouldFastmem(ctx, inst)) {
+ FastmemEmitWriteMemory<bitsize>(code, ctx, inst, *marker);
+ } else if (ctx.conf.page_table_pointer != 0) {
+ InlinePageTableEmitWriteMemory<bitsize>(code, ctx, inst);
+ } else {
+ CallbackOnlyEmitWriteMemory<bitsize>(code, ctx, inst);
+ }
+}
+
+template<size_t bitsize>
+void EmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ CallbackOnlyEmitExclusiveWriteMemory<bitsize>(code, ctx, inst);
+}
+
+template void EmitReadMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitReadMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitReadMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitReadMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitReadMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveReadMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveReadMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveReadMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveReadMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveReadMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitWriteMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitWriteMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitWriteMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitWriteMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitWriteMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveWriteMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveWriteMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveWriteMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveWriteMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template void EmitExclusiveWriteMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h
new file mode 100644
index 0000000000..bfc6d71f4e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h
@@ -0,0 +1,32 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace oaknut {
+struct CodeGenerator;
+struct Label;
+} // namespace oaknut
+
+namespace Dynarmic::IR {
+enum class AccType;
+class Inst;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmitContext;
+enum class LinkTarget;
+
+template<size_t bitsize>
+void EmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template<size_t bitsize>
+void EmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template<size_t bitsize>
+void EmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+template<size_t bitsize>
+void EmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp
new file mode 100644
index 0000000000..b9f13ac6f2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp
@@ -0,0 +1,409 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<typename EmitFn>
+static void EmitPackedOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ emit(Vresult, Va, Vb);
+}
+
+template<typename EmitFn>
+static void EmitSaturatedPackedOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+ ctx.fpsr.Spill();
+
+ emit(Vresult, Va, Vb);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.ADD(Vresult->B8(), Va->B8(), Vb->B8());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
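+        // GE is set in a lane iff the addend is unsigned-higher than the wrapped sum, i.e. a carry occurred.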
+ code.CMHI(Vge->B8(), Va->B8(), Vresult->B8());
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.ADD(Vresult->B8(), Va->B8(), Vb->B8());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
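+        // SHADD yields the full-precision signed sum halved, so its sign bit is the true sign of a+b.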
+ code.SHADD(Vge->B8(), Va->B8(), Vb->B8());
+ code.CMGE(Vge->B8(), Vge->B8(), 0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.SUB(Vresult->B8(), Va->B8(), Vb->B8());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.UHSUB(Vge->B8(), Va->B8(), Vb->B8());
+ code.CMGE(Vge->B8(), Vge->B8(), 0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.SUB(Vresult->B8(), Va->B8(), Vb->B8());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.SHSUB(Vge->B8(), Va->B8(), Vb->B8());
+ code.CMGE(Vge->B8(), Vge->B8(), 0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.ADD(Vresult->H4(), Va->H4(), Vb->H4());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.CMHI(Vge->H4(), Va->H4(), Vresult->H4());
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.ADD(Vresult->H4(), Va->H4(), Vb->H4());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.SHADD(Vge->H4(), Va->H4(), Vb->H4());
+ code.CMGE(Vge->H4(), Vge->H4(), 0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.SUB(Vresult->H4(), Va->H4(), Vb->H4());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.UHSUB(Vge->H4(), Va->H4(), Vb->H4());
+ code.CMGE(Vge->H4(), Vge->H4(), 0);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ code.SUB(Vresult->H4(), Va->H4(), Vb->H4());
+
+ if (ge_inst) {
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ code.SHSUB(Vge->H4(), Va->H4(), Vb->H4());
+ code.CMGE(Vge->H4(), Vge->H4(), 0);
+ }
+}
+
+template<bool add_is_hi, bool is_signed, bool is_halving>
+static void EmitPackedAddSub(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Va = ctx.reg_alloc.ReadD(args[0]);
+ auto Vb = ctx.reg_alloc.ReadD(args[1]);
+ RegAlloc::Realize(Vresult, Va, Vb);
+
+ if (is_signed) {
+ code.SXTL(V0.S4(), Va->H4());
+ code.SXTL(V1.S4(), Vb->H4());
+ } else {
+ code.UXTL(V0.S4(), Va->H4());
+ code.UXTL(V1.S4(), Vb->H4());
+ }
+ code.EXT(V1.B8(), V1.B8(), V1.B8(), 4);
+
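+    // V2 is all-ones in the lane that should be added: EOR then SUB with V2 two's-complement
+    // negates that lane of V1, so the final SUB gives a+b in that lane and a-b in the other.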
+ code.MOVI(D2, oaknut::RepImm{add_is_hi ? 0b11110000 : 0b00001111});
+
+ code.EOR(V1.B8(), V1.B8(), V2.B8());
+ code.SUB(V1.S2(), V1.S2(), V2.S2());
+ code.SUB(Vresult->S2(), V0.S2(), V1.S2());
+
+ if (is_halving) {
+ if (is_signed) {
+ code.SSHR(Vresult->S2(), Vresult->S2(), 1);
+ } else {
+ code.USHR(Vresult->S2(), Vresult->S2(), 1);
+ }
+ }
+
+ if (ge_inst) {
+ ASSERT(!is_halving);
+
+ auto Vge = ctx.reg_alloc.WriteD(ge_inst);
+ RegAlloc::Realize(Vge);
+
+ if (is_signed) {
+ code.CMGE(Vge->S2(), Vresult->S2(), 0);
+ code.XTN(Vge->H4(), Vge->toQ().S4());
+ } else {
+ code.CMEQ(Vge->H4(), Vresult->H4(), 0);
+ code.EOR(Vge->B8(), Vge->B8(), V2.B8());
+ code.SHRN(Vge->H4(), Vge->toQ().S4(), 16);
+ }
+ }
+
+ code.XTN(Vresult->H4(), Vresult->toQ().S4());
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<true, false, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAddSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<true, true, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<false, false, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSubAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<false, true, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHADD(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHADD(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHSUB(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHSUB(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHADD(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHADD(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHSUB(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHSUB(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<true, false, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingAddSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<true, true, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<false, false, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedHalvingSubAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedAddSub<false, true, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQADD(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQADD(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQSUB(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQSUB(Vresult->B8(), Va->B8(), Vb->B8()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQADD(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQADD(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQSUB(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSaturatedSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQSUB(Vresult->H4(), Va->H4(), Vb->H4()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedAbsDiffSumU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) {
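+        // Only the low four byte differences contribute to the sum; mask off the upper half
+        // before the horizontal add.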
+ code.MOVI(D2, oaknut::RepImm{0b00001111});
+ code.UABD(Vresult->B8(), Va->B8(), Vb->B8());
+ code.AND(Vresult->B8(), Vresult->B8(), V2.B8()); // TODO: Zext tracking
+ code.UADDLV(Vresult->toH(), Vresult->B8());
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::PackedSelect>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ auto Vresult = ctx.reg_alloc.WriteD(inst);
+ auto Vge = ctx.reg_alloc.ReadD(args[0]);
+ auto Va = ctx.reg_alloc.ReadD(args[1]);
+ auto Vb = ctx.reg_alloc.ReadD(args[2]);
+ RegAlloc::Realize(Vresult, Vge, Va, Vb);
+
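+    // With the GE mask in the destination, BSL picks bits from Vb where GE is set and from Va elsewhere.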
+ code.FMOV(Vresult, Vge); // TODO: Move elimination
+ code.BSL(Vresult->B8(), Vb->B8(), Va->B8());
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp
new file mode 100644
index 0000000000..95b8b51ee9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp
@@ -0,0 +1,273 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedAddWithFlag32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+ ASSERT(overflow_inst);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wa = ctx.reg_alloc.ReadW(args[0]);
+ auto Wb = ctx.reg_alloc.ReadW(args[1]);
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ RegAlloc::Realize(Wresult, Wa, Wb, Woverflow);
+ ctx.reg_alloc.SpillFlags();
+
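+    // On signed overflow the wrapped sum has the wrong sign: derive INT32_MIN/INT32_MAX from that
+    // sign and select it when the V flag is set.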
+ code.ADDS(Wresult, *Wa, Wb);
+ code.ASR(Wscratch0, Wresult, 31);
+ code.EOR(Wscratch0, Wscratch0, 0x8000'0000);
+ code.CSEL(Wresult, Wresult, Wscratch0, VC);
+ code.CSET(Woverflow, VS);
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedSubWithFlag32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+ ASSERT(overflow_inst);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Wa = ctx.reg_alloc.ReadW(args[0]);
+ auto Wb = ctx.reg_alloc.ReadW(args[1]);
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ RegAlloc::Realize(Wresult, Wa, Wb, Woverflow);
+ ctx.reg_alloc.SpillFlags();
+
+ code.SUBS(Wresult, *Wa, Wb);
+ code.ASR(Wscratch0, Wresult, 31);
+ code.EOR(Wscratch0, Wscratch0, 0x8000'0000);
+ code.CSEL(Wresult, Wresult, Wscratch0, VC);
+ code.CSET(Woverflow, VS);
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const size_t N = args[1].GetImmediateU8();
+ ASSERT(N >= 1 && N <= 32);
+
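+    // Saturating to the full 32-bit width is the identity and can never set the overflow flag.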
+ if (N == 32) {
+ ctx.reg_alloc.DefineAsExisting(inst, args[0]);
+ if (overflow_inst) {
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ RegAlloc::Realize(Woverflow);
+ code.MOV(*Woverflow, WZR);
+ }
+ return;
+ }
+
+ const u32 positive_saturated_value = (1u << (N - 1)) - 1;
+ const u32 negative_saturated_value = ~u32{0} << (N - 1);
+
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ RegAlloc::Realize(Woperand, Wresult);
+ ctx.reg_alloc.SpillFlags();
+
+ code.MOV(Wscratch0, negative_saturated_value);
+ code.MOV(Wscratch1, positive_saturated_value);
+ code.CMP(*Woperand, Wscratch0);
+ code.CSEL(Wresult, Woperand, Wscratch0, GT);
+ code.CMP(*Woperand, Wscratch1);
+ code.CSEL(Wresult, Wresult, Wscratch1, LT);
+
+ if (overflow_inst) {
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ RegAlloc::Realize(Woverflow);
+ code.CMP(*Wresult, Woperand);
+ code.CSET(Woverflow, NE);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Wresult = ctx.reg_alloc.WriteW(inst);
+ auto Woperand = ctx.reg_alloc.ReadW(args[0]);
+ RegAlloc::Realize(Wresult, Woperand);
+ ctx.reg_alloc.SpillFlags();
+
+ const size_t N = args[1].GetImmediateU8();
+ ASSERT(N <= 31);
+ const u32 saturated_value = (1u << N) - 1;
+
+ code.MOV(Wscratch0, saturated_value);
+ code.CMP(*Woperand, 0);
+ code.CSEL(Wresult, Woperand, WZR, GT);
+ code.CMP(*Woperand, Wscratch0);
+ code.CSEL(Wresult, Wresult, Wscratch0, LT);
+
+ if (overflow_inst) {
+ auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst);
+ RegAlloc::Realize(Woverflow);
+ code.CSET(Woverflow, HI);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedDoublingMultiplyReturnHigh16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedDoublingMultiplyReturnHigh32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::SignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::UnsignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp
new file mode 100644
index 0000000000..e14effca3b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp
@@ -0,0 +1,1889 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/common/always_false.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<typename EmitFn>
+static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qresult, Qoperand);
+
+ emit(Qresult, Qoperand);
+}
+
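+// Dispatches the element size to the corresponding full-width NEON arrangement: 8 -> .16B, 16 -> .8H, 32 -> .4S, 64 -> .2D.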
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) {
+ if constexpr (size == 8) {
+ emit(Qresult->B16(), Qoperand->B16());
+ } else if constexpr (size == 16) {
+ emit(Qresult->H8(), Qoperand->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qoperand->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qoperand->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
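+// Saturating forms load the FPSR first so that the cumulative saturation flag (QC) set by the host instruction is captured.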
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOpArranged<size>(code, ctx, inst, [&](auto Vresult, auto Voperand) {
+ ctx.fpsr.Load();
+ emit(Vresult, Voperand);
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) {
+ if constexpr (size == 8) {
+ emit(Qresult->H8(), Qoperand->toD().B8());
+ } else if constexpr (size == 16) {
+ emit(Qresult->S4(), Qoperand->toD().H4());
+ } else if constexpr (size == 32) {
+ emit(Qresult->D2(), Qoperand->toD().S2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedNarrow(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) {
+ if constexpr (size == 16) {
+ emit(Qresult->toD().B8(), Qoperand->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->toD().H4(), Qoperand->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->toD().S2(), Qoperand->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedSaturatedNarrow(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOpArrangedNarrow<size>(code, ctx, inst, [&](auto Vresult, auto Voperand) {
+ ctx.fpsr.Load();
+ emit(Vresult, Voperand);
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedPairWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) {
+ if constexpr (size == 8) {
+ emit(Qresult->H8(), Qoperand->B16());
+ } else if constexpr (size == 16) {
+ emit(Qresult->S4(), Qoperand->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->D2(), Qoperand->S4());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArrangedLower(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) {
+ if constexpr (size == 8) {
+ emit(Qresult->toD().B8(), Qoperand->toD().B8());
+ } else if constexpr (size == 16) {
+ emit(Qresult->toD().H4(), Qoperand->toD().H4());
+ } else if constexpr (size == 32) {
+ emit(Qresult->toD().S2(), Qoperand->toD().S2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<typename EmitFn>
+static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qa = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ RegAlloc::Realize(Qresult, Qa, Qb);
+
+ emit(Qresult, Qa, Qb);
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
+ if constexpr (size == 8) {
+ emit(Qresult->B16(), Qa->B16(), Qb->B16());
+ } else if constexpr (size == 16) {
+ emit(Qresult->H8(), Qa->H8(), Qb->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qa->S4(), Qb->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qa->D2(), Qb->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArrangedSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOpArranged<size>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) {
+ ctx.fpsr.Load();
+ emit(Vresult, Va, Vb);
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArrangedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
+ if constexpr (size == 8) {
+ emit(Qresult->H8(), Qa->toD().B8(), Qb->toD().B8());
+ } else if constexpr (size == 16) {
+ emit(Qresult->S4(), Qa->toD().H4(), Qb->toD().H4());
+ } else if constexpr (size == 32) {
+ emit(Qresult->D2(), Qa->toD().S2(), Qb->toD().S2());
+ } else if constexpr (size == 64) {
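+            // A 64-bit widening operation produces a single 128-bit element (the .1Q arrangement, as used by PMULL).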
+ emit(Qresult->Q1(), Qa->toD().D1(), Qb->toD().D1());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArrangedSaturatedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOpArrangedWiden<size>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) {
+ ctx.fpsr.Load();
+ emit(Vresult, Va, Vb);
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArrangedLower(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
+ if constexpr (size == 8) {
+ emit(Qresult->toD().B8(), Qa->toD().B8(), Qb->toD().B8());
+ } else if constexpr (size == 16) {
+ emit(Qresult->toD().H4(), Qa->toD().H4(), Qb->toD().H4());
+ } else if constexpr (size == 32) {
+ emit(Qresult->toD().S2(), Qa->toD().S2(), Qb->toD().S2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
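+// The accumulator (args[1]) is also the destination; args[0] supplies the value being accumulated, hence the swapped reads below.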
+template<size_t size, typename EmitFn>
+static void EmitSaturatedAccumulate(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qaccumulator = ctx.reg_alloc.ReadWriteQ(args[1], inst); // NB: Swapped
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); // NB: Swapped
+ RegAlloc::Realize(Qaccumulator, Qoperand);
+ ctx.fpsr.Load();
+
+ if constexpr (size == 8) {
+ emit(Qaccumulator->B16(), Qoperand->B16());
+ } else if constexpr (size == 16) {
+ emit(Qaccumulator->H8(), Qoperand->H8());
+ } else if constexpr (size == 32) {
+ emit(Qaccumulator->S4(), Qoperand->S4());
+ } else if constexpr (size == 64) {
+ emit(Qaccumulator->D2(), Qoperand->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+}
+
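+// Immediate-shift helpers: args[1] must be an immediate shift amount, encoded directly into the instruction.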
+template<size_t size, typename EmitFn>
+static void EmitImmShift(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+ RegAlloc::Realize(Qresult, Qoperand);
+
+ if constexpr (size == 8) {
+ emit(Qresult->B16(), Qoperand->B16(), shift_amount);
+ } else if constexpr (size == 16) {
+ emit(Qresult->H8(), Qoperand->H8(), shift_amount);
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qoperand->S4(), shift_amount);
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qoperand->D2(), shift_amount);
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+}
+
+template<size_t size, typename EmitFn>
+static void EmitImmShiftSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitImmShift<size>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) {
+ ctx.fpsr.Load();
+ emit(Vresult, Voperand, shift_amount);
+ });
+}
+
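+// Reductions write a scalar of the element width (a single B/H/S/D destination register).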
+template<size_t size, typename EmitFn>
+static void EmitReduce(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Vresult = ctx.reg_alloc.WriteVec<size>(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Vresult, Qoperand);
+
+ if constexpr (size == 8) {
+ emit(Vresult, Qoperand->B16());
+ } else if constexpr (size == 16) {
+ emit(Vresult, Qoperand->H8());
+ } else if constexpr (size == 32) {
+ emit(Vresult, Qoperand->S4());
+ } else if constexpr (size == 64) {
+ emit(Vresult, Qoperand->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+}
+
+template<size_t size, typename EmitFn>
+static void EmitGetElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ auto Rresult = ctx.reg_alloc.WriteReg<std::max<size_t>(32, size)>(inst);
+ auto Qvalue = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Rresult, Qvalue);
+
+ // TODO: fpr destination
+
+ emit(Rresult, Qvalue, index);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGetElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitGetElement<8>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Belem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGetElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitGetElement<16>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Helem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGetElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitGetElement<32>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Selem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGetElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitGetElement<64>(code, ctx, inst, [&](auto& Xresult, auto& Qvalue, u8 index) { code.UMOV(Xresult, Qvalue->Delem()[index]); });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitSetElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ auto Qvector = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Rvalue = ctx.reg_alloc.ReadReg<std::max<size_t>(32, size)>(args[2]);
+ RegAlloc::Realize(Qvector, Rvalue);
+
+ // TODO: fpr source
+
+ emit(Qvector, Rvalue, index);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSetElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSetElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Belem()[index], Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSetElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSetElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Helem()[index], Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSetElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSetElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Selem()[index], Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSetElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSetElement<64>(code, ctx, inst, [&](auto& Qvector, auto& Xvalue, u8 index) { code.MOV(Qvector->Delem()[index], Xvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAbs8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAnd>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.AND(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorAndNot>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.BIC(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticShiftRight8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticShiftRight16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticVShift8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticVShift16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticVShift32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorArithmeticVShift64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitBroadcast(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qvector = ctx.reg_alloc.WriteQ(inst);
+ auto Rvalue = ctx.reg_alloc.ReadReg<std::max<size_t>(32, size)>(args[0]);
+ RegAlloc::Realize(Qvector, Rvalue);
+
+ // TODO: fpr source
+
+ emit(Qvector, Rvalue);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().B8(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().H4(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().S2(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcast8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->B16(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcast16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->H8(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcast32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->S4(), Wvalue); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcast64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcast<64>(code, ctx, inst, [&](auto& Qvector, auto& Xvalue) { code.DUP(Qvector->D2(), Xvalue); });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitBroadcastElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qvector = ctx.reg_alloc.WriteQ(inst);
+ auto Qvalue = ctx.reg_alloc.ReadQ(args[0]);
+ const u8 index = args[1].GetImmediateU8();
+ RegAlloc::Realize(Qvector, Qvalue);
+
+ emit(Qvector, Qvalue, index);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElementLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().B8(), Qvalue->Belem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElementLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().H4(), Qvalue->Helem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElementLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().S2(), Qvalue->Selem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->B16(), Qvalue->Belem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->H8(), Qvalue->Helem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->S4(), Qvalue->Selem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorBroadcastElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitBroadcastElement<64>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->D2(), Qvalue->Delem()[index]); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorCountLeadingZeros8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorCountLeadingZeros16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorCountLeadingZeros32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEven8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEven16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEven32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEven64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOddLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOddLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorDeinterleaveOddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.EOR(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEqual8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEqual16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorEqual128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorExtract>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
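+    // The IR encodes the extraction position in bits; EXT takes a byte index, hence the assertion and the division by 8.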
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qa = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ const u8 position = args[2].GetImmediateU8();
+ ASSERT(position % 8 == 0);
+ RegAlloc::Realize(Qresult, Qa, Qb);
+
+ code.EXT(Qresult->B16(), Qa->B16(), Qb->B16(), position / 8);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorExtractLower>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ auto Da = ctx.reg_alloc.ReadD(args[0]);
+ auto Db = ctx.reg_alloc.ReadD(args[1]);
+ const u8 position = args[2].GetImmediateU8();
+ ASSERT(position % 8 == 0);
+ RegAlloc::Realize(Dresult, Da, Db);
+
+ code.EXT(Dresult->B8(), Da->B8(), Db->B8(), position / 8);
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGreaterS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGreaterS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGreaterS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorGreaterS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingAddU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorHalvingSubU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveLower64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveUpper8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveUpper16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveUpper32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorInterleaveUpper64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftRight8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftRight16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalVShift8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalVShift16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalVShift32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorLogicalVShift64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMaxU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMinU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiply8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiply64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ ASSERT_MSG(ctx.conf.very_verbose_debugging_output, "VectorMultiply64 is for debugging only");
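+    // AArch64 has no 64x64-bit vector multiply, so each doubleword lane is multiplied separately in general-purpose registers.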
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
+ code.FMOV(Xscratch0, Qa->toD());
+ code.FMOV(Xscratch1, Qb->toD());
+ code.MUL(Xscratch0, Xscratch0, Xscratch1);
+ code.FMOV(Qresult->toD(), Xscratch0);
+ code.FMOV(Xscratch0, Qa->Delem()[1]);
+ code.FMOV(Xscratch1, Qb->Delem()[1]);
+ code.MUL(Xscratch0, Xscratch0, Xscratch1);
+ code.FMOV(Qresult->Delem()[1], Xscratch0);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplySignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorNarrow16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorNarrow32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorNarrow64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorNot>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.NOT(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorOr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ORR(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddSignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddSignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddSignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedPairWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMaxLowerU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPairedMinLowerU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPolynomialMultiply8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPolynomialMultiplyLong8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPolynomialMultiplyLong64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedWiden<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorPopulationCount>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CNT(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseBits>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.RBIT(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInHalfGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV16(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInWordGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV32(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInWordGroups16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV32(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReduceAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReduce<8>(code, ctx, inst, [&](auto& Bresult, auto Voperand) { code.ADDV(Bresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReduceAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReduce<16>(code, ctx, inst, [&](auto& Hresult, auto Voperand) { code.ADDV(Hresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReduceAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReduce<32>(code, ctx, inst, [&](auto& Sresult, auto Voperand) { code.ADDV(Sresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorReduceAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitReduce<64>(code, ctx, inst, [&](auto& Dresult, auto Voperand) { code.ADDP(Dresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRotateWholeVectorRight>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) {
+ ASSERT(shift_amount % 8 == 0);
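+        // EXT on Voperand:Voperand rotates the whole vector right by ext_imm bytes (shift_amount bits).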
+ const u8 ext_imm = (shift_amount % 128) / 8;
+ code.EXT(Vresult, Voperand, Voperand, ext_imm);
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingHalvingAddU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorRoundingShiftLeftU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignExtend8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignExtend16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignExtend32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignExtend64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAbs8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<8>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<16>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<32>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<64>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHigh16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULH(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHigh32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULH(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQRDMULH(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQRDMULH(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyLong16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturatedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyLong32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturatedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNeg8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNeg16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShiftSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShiftSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShiftSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitImmShiftSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTable>(oaknut::CodeGenerator&, EmitContext&, IR::Inst* inst) {
+    // Do nothing. We *want* to hold on to the reference count of our arguments so that VectorTableLookup can use them.
+ ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTableLookup64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());
+
+ const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
+ const bool is_defaults_zero = inst->GetArg(0).IsZero();
+
+ auto Dresult = is_defaults_zero ? ctx.reg_alloc.WriteD(inst) : ctx.reg_alloc.ReadWriteD(args[0], inst);
+ auto Dindices = ctx.reg_alloc.ReadD(args[2]);
+ std::vector<RAReg<oaknut::DReg>> Dtable;
+ for (size_t i = 0; i < table_size; i++) {
+ Dtable.emplace_back(ctx.reg_alloc.ReadD(table[i]));
+ }
+ RegAlloc::Realize(Dresult, Dindices);
+ for (size_t i = 0; i < table_size; i++) {
+ RegAlloc::Realize(Dtable[i]);
+ }
+
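+    // The guest table consists of up to four 64-bit entries; pairs are zipped into 128-bit table
+    // registers for TBL/TBX. For odd table sizes, indices past the end of the guest table are forced
+    // out of range via the CMGE/ORR mask, so that TBL yields zero and TBX leaves the destination unchanged.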
+ switch (table_size) {
+ case 1:
+ code.MOVI(V2.B16(), 0x08);
+ code.CMGE(V2.B8(), Dindices->B8(), V2.B8());
+ code.ORR(V2.B8(), Dindices->B8(), V2.B8());
+ code.FMOV(D0, Dtable[0]);
+ if (is_defaults_zero) {
+ code.TBL(Dresult->B8(), oaknut::List{V0.B16()}, D2.B8());
+ } else {
+ code.TBX(Dresult->B8(), oaknut::List{V0.B16()}, D2.B8());
+ }
+ break;
+ case 2:
+ code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2());
+ if (is_defaults_zero) {
+ code.TBL(Dresult->B8(), oaknut::List{V0.B16()}, Dindices->B8());
+ } else {
+ code.TBX(Dresult->B8(), oaknut::List{V0.B16()}, Dindices->B8());
+ }
+ break;
+ case 3:
+ code.MOVI(V2.B16(), 0x18);
+ code.CMGE(V2.B8(), Dindices->B8(), V2.B8());
+ code.ORR(V2.B8(), Dindices->B8(), V2.B8());
+ code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2());
+ code.FMOV(D1, Dtable[2]);
+ if (is_defaults_zero) {
+ code.TBL(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, D2.B8());
+ } else {
+ code.TBX(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, D2.B8());
+ }
+ break;
+ case 4:
+ code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2());
+ code.ZIP1(V1.D2(), Dtable[2]->toQ().D2(), Dtable[3]->toQ().D2());
+ if (is_defaults_zero) {
+ code.TBL(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, Dindices->B8());
+ } else {
+ code.TBX(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, Dindices->B8());
+ }
+ break;
+ default:
+ ASSERT_FALSE("Unsupported table_size");
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTableLookup128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());
+
+ const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
+ const bool is_defaults_zero = inst->GetArg(0).IsZero();
+
+ auto Qresult = is_defaults_zero ? ctx.reg_alloc.WriteQ(inst) : ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Qindices = ctx.reg_alloc.ReadQ(args[2]);
+ std::vector<RAReg<oaknut::QReg>> Qtable;
+ for (size_t i = 0; i < table_size; i++) {
+ Qtable.emplace_back(ctx.reg_alloc.ReadQ(table[i]));
+ }
+ RegAlloc::Realize(Qresult, Qindices);
+ for (size_t i = 0; i < table_size; i++) {
+ RegAlloc::Realize(Qtable[i]);
+ }
+
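+    // TBL/TBX take a list of consecutively numbered registers, so the table is first copied into
+    // V0-V3; the single-entry case can use the allocated register directly.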
+ switch (table_size) {
+ case 1:
+ if (is_defaults_zero) {
+ code.TBL(Qresult->B16(), oaknut::List{Qtable[0]->B16()}, Qindices->B16());
+ } else {
+ code.TBX(Qresult->B16(), oaknut::List{Qtable[0]->B16()}, Qindices->B16());
+ }
+ break;
+ case 2:
+ code.MOV(V0.B16(), Qtable[0]->B16());
+ code.MOV(V1.B16(), Qtable[1]->B16());
+ if (is_defaults_zero) {
+ code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16()}, Qindices->B16());
+ } else {
+ code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16()}, Qindices->B16());
+ }
+ break;
+ case 3:
+ code.MOV(V0.B16(), Qtable[0]->B16());
+ code.MOV(V1.B16(), Qtable[1]->B16());
+ code.MOV(V2.B16(), Qtable[2]->B16());
+ if (is_defaults_zero) {
+ code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16()}, Qindices->B16());
+ } else {
+ code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16()}, Qindices->B16());
+ }
+ break;
+ case 4:
+ code.MOV(V0.B16(), Qtable[0]->B16());
+ code.MOV(V1.B16(), Qtable[1]->B16());
+ code.MOV(V2.B16(), Qtable[2]->B16());
+ code.MOV(V3.B16(), Qtable[3]->B16());
+ if (is_defaults_zero) {
+ code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16(), V3.B16()}, Qindices->B16());
+ } else {
+ code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16(), V3.B16()}, Qindices->B16());
+ }
+ break;
+ default:
+ ASSERT_FALSE("Unsupported table_size");
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTranspose8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool part = inst->GetArg(2).GetU1();
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTranspose16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool part = inst->GetArg(2).GetU1();
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTranspose32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool part = inst->GetArg(2).GetU1();
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorTranspose64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool part = inst->GetArg(2).GetU1();
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedRecipEstimate>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.URECPE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedRecipSqrtEstimate>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.URSQRTE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<8>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<16>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<32>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitSaturatedAccumulate<64>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorZeroExtend8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorZeroExtend16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorZeroExtend32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorZeroExtend64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
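+    // Copying through the D view zeroes the upper 64 bits, which is exactly the required zero-extension.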
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { code.FMOV(Qresult->toD(), Qoperand->toD()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorZeroUpper>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { code.FMOV(Qresult->toD(), Qoperand->toD()); });
+}
+
+template<>
+void EmitIR<IR::Opcode::ZeroVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Qresult);
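+    // Writing zero via the D view also clears the upper 64 bits, zeroing the entire vector.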
+ code.MOVI(Qresult->toD(), oaknut::RepImm{0});
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp
new file mode 100644
index 0000000000..e0a53bf9e4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp
@@ -0,0 +1,791 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit_cast.hpp>
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <mcl/mp/typelist/cartesian_product.hpp>
+#include <mcl/mp/typelist/get.hpp>
+#include <mcl/mp/typelist/lift_sequence.hpp>
+#include <mcl/mp/typelist/list.hpp>
+#include <mcl/mp/typelist/lower_to_tuple.hpp>
+#include <mcl/type_traits/function_info.hpp>
+#include <mcl/type_traits/integer_of_size.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/a64_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/common/always_false.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/op.h"
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/common/lut_from_list.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+namespace mp = mcl::mp;
+
+using A64FullVectorWidth = std::integral_constant<size_t, 128>;
+
+// Array alias whose length is derived from the element type T and the size of
+// a vector register, e.g. T = u32 results in a std::array<u32, 4>.
+template<typename T>
+using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>;
+
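+// Runs `emit` under the FPCR value selected by fpcr_controlled: if it differs from the current
+// guest FPCR, that value is temporarily written to the host FPCR and the original is restored afterwards.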
+template<typename EmitFn>
+static void MaybeStandardFPSCRValue(oaknut::CodeGenerator& code, EmitContext& ctx, bool fpcr_controlled, EmitFn emit) {
+ if (ctx.FPCR(fpcr_controlled) != ctx.FPCR()) {
+ code.MOV(Wscratch0, ctx.FPCR(fpcr_controlled).Value());
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+ emit();
+ code.MOV(Wscratch0, ctx.FPCR().Value());
+ code.MSR(oaknut::SystemReg::FPCR, Xscratch0);
+ } else {
+ emit();
+ }
+}
+
+template<typename EmitFn>
+static void EmitTwoOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qa = ctx.reg_alloc.ReadQ(args[0]);
+ const bool fpcr_controlled = args[1].IsVoid() || args[1].GetImmediateU1();
+ RegAlloc::Realize(Qresult, Qa);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { emit(Qresult, Qa); });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitTwoOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qa) {
+ if constexpr (size == 16) {
+ emit(Qresult->H8(), Qa->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qa->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qa->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<typename EmitFn>
+static void EmitThreeOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qa = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ RegAlloc::Realize(Qresult, Qa, Qb);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { emit(Qresult, Qa, Qb); });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitThreeOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
+ if constexpr (size == 16) {
+ emit(Qresult->H8(), Qa->H8(), Qb->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qa->S4(), Qb->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qa->D2(), Qb->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitFMA(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ auto Qm = ctx.reg_alloc.ReadQ(args[1]);
+ auto Qn = ctx.reg_alloc.ReadQ(args[2]);
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ RegAlloc::Realize(Qresult, Qm, Qn);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if constexpr (size == 16) {
+ emit(Qresult->H8(), Qm->H8(), Qn->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qm->S4(), Qn->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qm->D2(), Qn->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t size, typename EmitFn>
+static void EmitFromFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qto = ctx.reg_alloc.WriteQ(inst);
+ auto Qfrom = ctx.reg_alloc.ReadQ(args[0]);
+ const u8 fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = args[3].GetImmediateU1();
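+    // SCVTF/UCVTF round according to the FPCR, so the rounding mode requested by the IR must match it.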
+ ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
+ RegAlloc::Realize(Qto, Qfrom);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if constexpr (size == 32) {
+ emit(Qto->S4(), Qfrom->S4(), fbits);
+ } else if constexpr (size == 64) {
+ emit(Qto->D2(), Qfrom->D2(), fbits);
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+ });
+}
+
+template<size_t fsize, bool is_signed>
+void EmitToFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qto = ctx.reg_alloc.WriteQ(inst);
+ auto Qfrom = ctx.reg_alloc.ReadQ(args[0]);
+ const size_t fbits = args[1].GetImmediateU8();
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = inst->GetArg(3).GetU1();
+ RegAlloc::Realize(Qto, Qfrom);
+ ctx.fpsr.Load();
+
+ auto Vto = [&] {
+ if constexpr (fsize == 32) {
+ return Qto->S4();
+ } else if constexpr (fsize == 64) {
+ return Qto->D2();
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<fsize>>);
+ }
+ }();
+ auto Vfrom = [&] {
+ if constexpr (fsize == 32) {
+ return Qfrom->S4();
+ } else if constexpr (fsize == 64) {
+ return Qfrom->D2();
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<fsize>>);
+ }
+ }();
+
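+    // Only the towards-zero converts accept a fractional-bits immediate; every other rounding mode
+    // requires fbits == 0 and uses its dedicated FCVT instruction.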
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (rounding_mode == FP::RoundingMode::TowardsZero) {
+ if constexpr (is_signed) {
+ if (fbits) {
+ code.FCVTZS(Vto, Vfrom, fbits);
+ } else {
+ code.FCVTZS(Vto, Vfrom);
+ }
+ } else {
+ if (fbits) {
+ code.FCVTZU(Vto, Vfrom, fbits);
+ } else {
+ code.FCVTZU(Vto, Vfrom);
+ }
+ }
+ } else {
+ ASSERT(fbits == 0);
+ if constexpr (is_signed) {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FCVTNS(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FCVTPS(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FCVTMS(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FCVTZS(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FCVTAS(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::ToOdd:
+ ASSERT_FALSE("Unimplemented");
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ break;
+ }
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FCVTNU(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FCVTPU(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FCVTMU(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FCVTZU(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FCVTAU(Vto, Vfrom);
+ break;
+ case FP::RoundingMode::ToOdd:
+ ASSERT_FALSE("Unimplemented");
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ break;
+ }
+ }
+ }
+ });
+}
+
+template<typename Lambda>
+static void EmitTwoOpFallbackWithoutRegAlloc(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::QReg Qresult, oaknut::QReg Qarg1, Lambda lambda, bool fpcr_controlled) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+
+ const u32 fpcr = ctx.FPCR(fpcr_controlled).Value();
+ constexpr u64 stack_size = sizeof(u64) * 4; // sizeof(u128) * 2
+
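+    // Fallback call ABI: X0 = pointer to the output vector (SP+0), X1 = pointer to a stack copy
+    // of the input (SP+16), X2 = FPCR value, X3 = pointer to the guest FPSR in the JIT state.
+    // Caller-saved registers other than the result register are preserved around the call.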
+ ABI_PushRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size);
+
+ code.MOV(Xscratch0, mcl::bit_cast<u64>(fn));
+ code.ADD(X0, SP, 0 * 16);
+ code.ADD(X1, SP, 1 * 16);
+ code.MOV(X2, fpcr);
+ code.ADD(X3, Xstate, ctx.conf.state_fpsr_offset);
+ code.STR(Qarg1, X1);
+ code.BLR(Xscratch0);
+ code.LDR(Qresult, SP);
+
+ ABI_PopRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size);
+}
+
+template<size_t fpcr_controlled_arg_index = 1, typename Lambda>
+static void EmitTwoOpFallback(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qarg1 = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ RegAlloc::Realize(Qarg1, Qresult);
+ ctx.reg_alloc.SpillFlags();
+ ctx.fpsr.Spill();
+
+ const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
+ EmitTwoOpFallbackWithoutRegAlloc(code, ctx, Qresult, Qarg1, lambda, fpcr_controlled);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.ReadWriteQ(args[0], inst);
+ RegAlloc::Realize(Qresult);
+
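+    // Clear bit 15 of every halfword lane, i.e. the sign bit of each half-precision value.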
+ code.BIC(Qresult->H8(), 0b10000000, LSL, 8);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FABS(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FABS(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FDIV(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FDIV(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorEqual16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMEQ(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorFromHalf32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+ ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Doperand = ctx.reg_alloc.ReadD(args[0]);
+ RegAlloc::Realize(Qresult, Doperand);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.FCVTL(Qresult->S4(), Doperand->H4());
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorFromSignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<32>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.SCVTF(Vto, Vfrom, fbits) : code.SCVTF(Vto, Vfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorFromSignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<64>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.SCVTF(Vto, Vfrom, fbits) : code.SCVTF(Vto, Vfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorFromUnsignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<32>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.UCVTF(Vto, Vfrom, fbits) : code.UCVTF(Vto, Vfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorFromUnsignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFromFixed<64>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.UCVTF(Vto, Vfrom, fbits) : code.UCVTF(Vto, Vfrom); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorGreater32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorGreater64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGT(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorGreaterEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGE(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorGreaterEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGE(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMax32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMax64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMin64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMulAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMulAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFMA<32>(code, ctx, inst, [&](auto Va, auto Vn, auto Vm) { code.FMLA(Va, Vn, Vm); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitFMA<64>(code, ctx, inst, [&](auto Va, auto Vn, auto Vm) { code.FMLA(Va, Vn, Vm); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMULX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorMulX64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMULX(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorNeg16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FNEG(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FNEG(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorPairedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorPairedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADDP(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorPairedAddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
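+        // Interleave the low halves of a and b, then pairwise-add against zero: the two sums
+        // land in the low half of the result and the upper half is zeroed.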
+ code.ZIP1(V0.D2(), Qa->D2(), Qb->D2());
+ code.MOVI(D1, oaknut::RepImm{0});
+ code.FADDP(Qresult->S4(), V0.S4(), V1.S4());
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorPairedAddLower64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) {
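+        // Place the low doublewords of a and b side by side; the scalar FADDP sums the pair
+        // into the low 64 bits, zeroing the upper half.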
+ code.ZIP1(V0.D2(), Qa->D2(), Qb->D2());
+ code.FADDP(Qresult->toD(), V0.D2());
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRECPE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRECPE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRecipStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRoundInt16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+
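+    // Half-precision rounding is performed by a soft-float fallback; the handler is selected
+    // from a lookup table keyed on (rounding_mode, exact).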
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+ using exact_list = mp::list<std::true_type, std::false_type>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ using FPT = u16;
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
+ constexpr bool exact = mp::get<1, I>::value;
+
+ for (size_t i = 0; i < output.size(); ++i) {
+ output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
+ }
+ })};
+ },
+ mp::cartesian_product<rounding_list, exact_list>{});
+
+ EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact)));
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRoundInt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+ const bool fpcr_controlled = inst->GetArg(3).GetU1();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qresult, Qoperand);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (exact) {
+ ASSERT(ctx.FPCR(fpcr_controlled).RMode() == rounding_mode);
+ code.FRINTX(Qresult->S4(), Qoperand->S4());
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FRINTN(Qresult->S4(), Qoperand->S4());
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FRINTP(Qresult->S4(), Qoperand->S4());
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FRINTM(Qresult->S4(), Qoperand->S4());
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FRINTZ(Qresult->S4(), Qoperand->S4());
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FRINTA(Qresult->S4(), Qoperand->S4());
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ }
+ }
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRoundInt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+ const bool fpcr_controlled = inst->GetArg(3).GetU1();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Qresult, Qoperand);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (exact) {
+ ASSERT(ctx.FPCR(fpcr_controlled).RMode() == rounding_mode);
+ code.FRINTX(Qresult->D2(), Qoperand->D2());
+ } else {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ code.FRINTN(Qresult->D2(), Qoperand->D2());
+ break;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ code.FRINTP(Qresult->D2(), Qoperand->D2());
+ break;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ code.FRINTM(Qresult->D2(), Qoperand->D2());
+ break;
+ case FP::RoundingMode::TowardsZero:
+ code.FRINTZ(Qresult->D2(), Qoperand->D2());
+ break;
+ case FP::RoundingMode::ToNearest_TieAwayFromZero:
+ code.FRINTA(Qresult->D2(), Qoperand->D2());
+ break;
+ default:
+ ASSERT_FALSE("Invalid RoundingMode");
+ }
+ }
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRSQRTE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRSQRTE(Vresult, Voperand); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRSQRTS(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorRSqrtStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRSQRTS(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorSqrt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FSQRT(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorSqrt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FSQRT(Vresult, Va); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToHalf32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+ ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ auto Dresult = ctx.reg_alloc.WriteD(inst);
+ auto Qoperand = ctx.reg_alloc.ReadQ(args[0]);
+ RegAlloc::Realize(Dresult, Qoperand);
+ ctx.fpsr.Load();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.FCVTN(Dresult->H4(), Qoperand->S4());
+ });
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToSignedFixed16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToSignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToSignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, true>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToUnsignedFixed16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ (void)code;
+ (void)ctx;
+ (void)inst;
+ ASSERT_FALSE("Unimplemented");
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToUnsignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<32, false>(code, ctx, inst);
+}
+
+template<>
+void EmitIR<IR::Opcode::FPVectorToUnsignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ EmitToFixed<64, false>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp
new file mode 100644
index 0000000000..722a09176b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp
@@ -0,0 +1,126 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/a32_jitstate.h"
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/common/always_false.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+template<size_t size, typename EmitFn>
+static void Emit(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto Qresult = ctx.reg_alloc.WriteQ(inst);
+ auto Qa = ctx.reg_alloc.ReadQ(args[0]);
+ auto Qb = ctx.reg_alloc.ReadQ(args[1]);
+ RegAlloc::Realize(Qresult, Qa, Qb);
+ ctx.fpsr.Load();
+
+ if constexpr (size == 8) {
+ emit(Qresult->B16(), Qa->B16(), Qb->B16());
+ } else if constexpr (size == 16) {
+ emit(Qresult->H8(), Qa->H8(), Qb->H8());
+ } else if constexpr (size == 32) {
+ emit(Qresult->S4(), Qa->S4(), Qb->S4());
+ } else if constexpr (size == 64) {
+ emit(Qresult->D2(), Qa->D2(), Qb->D2());
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<size>>);
+ }
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorSignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); });
+}
+
+template<>
+void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
+ Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); });
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h
new file mode 100644
index 0000000000..6bfc6f3cae
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h
@@ -0,0 +1,51 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/emit_arm64.h"
+#include "dynarmic/backend/arm64/reg_alloc.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/ir/basic_block.h"
+
+namespace Dynarmic::IR {
+class Block;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmitConfig;
+class FastmemManager;
+class FpsrManager;
+
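+// Labels wrapped in shared_ptr so they can outlive the scope that created them, e.g. when captured by deferred_emits.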
+using SharedLabel = std::shared_ptr<oaknut::Label>;
+
+inline SharedLabel GenSharedLabel() {
+ return std::make_shared<oaknut::Label>();
+}
+
+struct EmitContext {
+ IR::Block& block;
+ RegAlloc& reg_alloc;
+ const EmitConfig& conf;
+ EmittedBlockInfo& ebi;
+ FpsrManager& fpsr;
+ FastmemManager& fastmem;
+
+ std::vector<std::function<void()>> deferred_emits;
+
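+    // Returns the block's FPCR, or the A-SIMD standard value when fpcr_controlled is false.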
+ FP::FPCR FPCR(bool fpcr_controlled = true) const {
+ const FP::FPCR fpcr = conf.descriptor_to_fpcr(block.Location());
+ return fpcr_controlled ? fpcr : fpcr.ASIMDStandardValue();
+ }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp
new file mode 100644
index 0000000000..d57a29cd85
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp
@@ -0,0 +1,58 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/interface/exclusive_monitor.h"
+
+#include <algorithm>
+
+#include <mcl/assert.hpp>
+
+namespace Dynarmic {
+
+ExclusiveMonitor::ExclusiveMonitor(size_t processor_count)
+ : exclusive_addresses(processor_count, INVALID_EXCLUSIVE_ADDRESS), exclusive_values(processor_count) {}
+
+size_t ExclusiveMonitor::GetProcessorCount() const {
+ return exclusive_addresses.size();
+}
+
+void ExclusiveMonitor::Lock() {
+ lock.Lock();
+}
+
+void ExclusiveMonitor::Unlock() {
+ lock.Unlock();
+}
+
+bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) {
+ const VAddr masked_address = address & RESERVATION_GRANULE_MASK;
+
+ Lock();
+ if (exclusive_addresses[processor_id] != masked_address) {
+ Unlock();
+ return false;
+ }
+
+ for (VAddr& other_address : exclusive_addresses) {
+ if (other_address == masked_address) {
+ other_address = INVALID_EXCLUSIVE_ADDRESS;
+ }
+ }
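+    // Note: the lock is intentionally still held at this point; the caller is expected to perform its store and then call Unlock().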
+ return true;
+}
+
+void ExclusiveMonitor::Clear() {
+ Lock();
+ std::fill(exclusive_addresses.begin(), exclusive_addresses.end(), INVALID_EXCLUSIVE_ADDRESS);
+ Unlock();
+}
+
+void ExclusiveMonitor::ClearProcessor(size_t processor_id) {
+ Lock();
+ exclusive_addresses[processor_id] = INVALID_EXCLUSIVE_ADDRESS;
+ Unlock();
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h b/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h
new file mode 100644
index 0000000000..8ed686aeaf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h
@@ -0,0 +1,56 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <tuple>
+
+#include <mcl/hash/xmrx.hpp>
+#include <mcl/stdint.hpp>
+#include <tsl/robin_set.h>
+
+#include "dynarmic/backend/exception_handler.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>;
+
+struct DoNotFastmemMarkerHash {
+ size_t operator()(const DoNotFastmemMarker& value) const {
+ return mcl::hash::xmrx(std::get<0>(value).Value() ^ static_cast<u64>(std::get<1>(value)));
+ }
+};
+
+struct FastmemPatchInfo {
+ DoNotFastmemMarker marker;
+ FakeCall fc;
+ bool recompile;
+};
+
+class FastmemManager {
+public:
+ explicit FastmemManager(ExceptionHandler& eh)
+ : exception_handler(eh) {}
+
+ bool SupportsFastmem() const {
+ return exception_handler.SupportsFastmem();
+ }
+
+ bool ShouldFastmem(DoNotFastmemMarker marker) const {
+ return do_not_fastmem.count(marker) == 0;
+ }
+
+ void MarkDoNotFastmem(DoNotFastmemMarker marker) {
+ do_not_fastmem.insert(marker);
+ }
+
+private:
+ ExceptionHandler& exception_handler;
+ tsl::robin_set<DoNotFastmemMarker, DoNotFastmemMarkerHash> do_not_fastmem;
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp
new file mode 100644
index 0000000000..1a1c323e65
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp
@@ -0,0 +1,40 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/abi.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+FpsrManager::FpsrManager(oaknut::CodeGenerator& code, size_t state_fpsr_offset)
+ : code{code}, state_fpsr_offset{state_fpsr_offset} {}
+
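+// Merge the exception flags accumulated in the host FPSR into the guest FPSR stored in the JIT state.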
+void FpsrManager::Spill() {
+ if (!fpsr_loaded)
+ return;
+
+ code.LDR(Wscratch0, Xstate, state_fpsr_offset);
+ code.MRS(Xscratch1, oaknut::SystemReg::FPSR);
+ code.ORR(Wscratch0, Wscratch0, Wscratch1);
+ code.STR(Wscratch0, Xstate, state_fpsr_offset);
+
+ fpsr_loaded = false;
+}
+
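+// Zero the host FPSR so that exception flags accumulate from a clean slate; Spill() merges them back later.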
+void FpsrManager::Load() {
+ if (fpsr_loaded)
+ return;
+
+ code.MSR(oaknut::SystemReg::FPSR, XZR);
+
+ fpsr_loaded = true;
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h
new file mode 100644
index 0000000000..e003522dde
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h
@@ -0,0 +1,30 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace oaknut {
+struct CodeGenerator;
+} // namespace oaknut
+
+namespace Dynarmic::Backend::Arm64 {
+
+class FpsrManager {
+public:
+ explicit FpsrManager(oaknut::CodeGenerator& code, size_t state_fpsr_offset);
+
+ void Spill();
+ void Load();
+ void Overwrite() { fpsr_loaded = false; }
+
+private:
+ oaknut::CodeGenerator& code;
+ size_t state_fpsr_offset;
+ bool fpsr_loaded = false;
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp
new file mode 100644
index 0000000000..fedee02f07
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp
@@ -0,0 +1,613 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/reg_alloc.h"
+
+#include <algorithm>
+#include <array>
+#include <iterator>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/backend/arm64/fpsr_manager.h"
+#include "dynarmic/backend/arm64/verbose_debugging_output.h"
+#include "dynarmic/common/always_false.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+constexpr size_t spill_offset = offsetof(StackLayout, spill);
+constexpr size_t spill_slot_size = sizeof(decltype(StackLayout::spill)::value_type);
+
+static bool IsValuelessType(IR::Type type) {
+ switch (type) {
+ case IR::Type::Table:
+ return true;
+ default:
+ return false;
+ }
+}
+
+IR::Type Argument::GetType() const {
+ return value.GetType();
+}
+
+bool Argument::IsImmediate() const {
+ return value.IsImmediate();
+}
+
+bool Argument::GetImmediateU1() const {
+ return value.GetU1();
+}
+
+u8 Argument::GetImmediateU8() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x100);
+ return u8(imm);
+}
+
+u16 Argument::GetImmediateU16() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x10000);
+ return u16(imm);
+}
+
+u32 Argument::GetImmediateU32() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x100000000);
+ return u32(imm);
+}
+
+u64 Argument::GetImmediateU64() const {
+ return value.GetImmediateAsU64();
+}
+
+IR::Cond Argument::GetImmediateCond() const {
+ ASSERT(IsImmediate() && GetType() == IR::Type::Cond);
+ return value.GetCond();
+}
+
+IR::AccType Argument::GetImmediateAccType() const {
+ ASSERT(IsImmediate() && GetType() == IR::Type::AccType);
+ return value.GetAccType();
+}
+
+HostLoc::Kind Argument::CurrentLocationKind() const {
+ return reg_alloc.ValueLocation(value.GetInst())->kind;
+}
+
+bool HostLocInfo::Contains(const IR::Inst* value) const {
+ return std::find(values.begin(), values.end(), value) != values.end();
+}
+
+void HostLocInfo::SetupScratchLocation() {
+ ASSERT(IsCompletelyEmpty());
+ realized = true;
+}
+
+void HostLocInfo::SetupLocation(const IR::Inst* value) {
+ ASSERT(IsCompletelyEmpty());
+ values.clear();
+ values.push_back(value);
+ realized = true;
+ uses_this_inst = 0;
+ accumulated_uses = 0;
+ expected_uses = value->UseCount();
+}
+
+bool HostLocInfo::IsCompletelyEmpty() const {
+ return values.empty() && !locked && !realized && !accumulated_uses && !expected_uses && !uses_this_inst;
+}
+
+bool HostLocInfo::MaybeAllocatable() const {
+ return !locked && !realized;
+}
+
+bool HostLocInfo::IsOneRemainingUse() const {
+ return accumulated_uses + 1 == expected_uses && uses_this_inst == 1;
+}
+
+void HostLocInfo::UpdateUses() {
+ accumulated_uses += uses_this_inst;
+ uses_this_inst = 0;
+
+ if (accumulated_uses == expected_uses) {
+ values.clear();
+ accumulated_uses = 0;
+ expected_uses = 0;
+ }
+}
+
+RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) {
+ ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}};
+ for (size_t i = 0; i < inst->NumArgs(); i++) {
+ const IR::Value arg = inst->GetArg(i);
+ ret[i].value = arg;
+ if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) {
+            ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already be defined");
+ ValueInfo(arg.GetInst()).uses_this_inst++;
+ }
+ }
+ return ret;
+}
+
+bool RegAlloc::WasValueDefined(IR::Inst* inst) const {
+ return defined_insts.count(inst) > 0;
+}
+
+void RegAlloc::PrepareForCall(std::optional<Argument::copyable_reference> arg0, std::optional<Argument::copyable_reference> arg1, std::optional<Argument::copyable_reference> arg2, std::optional<Argument::copyable_reference> arg3) {
+ fpsr_manager.Spill();
+ SpillFlags();
+
+ // TODO: Spill into callee-save registers
+
+ for (int i = 0; i < 32; i++) {
+ if (mcl::bit::get_bit(i, static_cast<u32>(ABI_CALLER_SAVE))) {
+ SpillGpr(i);
+ }
+ }
+
+ for (int i = 0; i < 32; i++) {
+ if (mcl::bit::get_bit(i, static_cast<u32>(ABI_CALLER_SAVE >> 32))) {
+ SpillFpr(i);
+ }
+ }
+
+ const std::array<std::optional<Argument::copyable_reference>, 4> args{arg0, arg1, arg2, arg3};
+
+ // AAPCS64 Next General-purpose Register Number
+ int ngrn = 0;
+ // AAPCS64 Next SIMD and Floating-point Register Number
+ int nsrn = 0;
+
+ for (int i = 0; i < 4; i++) {
+ if (args[i]) {
+ if (args[i]->get().GetType() == IR::Type::U128) {
+ ASSERT(fprs[nsrn].IsCompletelyEmpty());
+ LoadCopyInto(args[i]->get().value, oaknut::QReg{nsrn});
+ nsrn++;
+ } else {
+ ASSERT(gprs[ngrn].IsCompletelyEmpty());
+ LoadCopyInto(args[i]->get().value, oaknut::XReg{ngrn});
+ ngrn++;
+ }
+ } else {
+ // Gaps are assumed to be in general-purpose registers
+ // TODO: should there be a separate list passed for FPRs instead?
+ ngrn++;
+ }
+ }
+}
+
+void RegAlloc::DefineAsExisting(IR::Inst* inst, Argument& arg) {
+ defined_insts.insert(inst);
+
+ ASSERT(!ValueLocation(inst));
+
+ if (arg.value.IsImmediate()) {
+ inst->ReplaceUsesWith(arg.value);
+ return;
+ }
+
+ auto& info = ValueInfo(arg.value.GetInst());
+ info.values.push_back(inst);
+ info.expected_uses += inst->UseCount();
+}
+
+void RegAlloc::DefineAsRegister(IR::Inst* inst, oaknut::Reg reg) {
+ defined_insts.insert(inst);
+
+ ASSERT(!ValueLocation(inst));
+ auto& info = reg.is_vector() ? fprs[reg.index()] : gprs[reg.index()];
+ ASSERT(info.IsCompletelyEmpty());
+ info.values.push_back(inst);
+ info.expected_uses += inst->UseCount();
+}
+
+void RegAlloc::UpdateAllUses() {
+ for (auto& gpr : gprs) {
+ gpr.UpdateUses();
+ }
+ for (auto& fpr : fprs) {
+ fpr.UpdateUses();
+ }
+ flags.UpdateUses();
+ for (auto& spill : spills) {
+ spill.UpdateUses();
+ }
+}
+
+void RegAlloc::AssertAllUnlocked() const {
+ const auto is_unlocked = [](const auto& i) { return !i.locked && !i.realized; };
+ ASSERT(std::all_of(gprs.begin(), gprs.end(), is_unlocked));
+ ASSERT(std::all_of(fprs.begin(), fprs.end(), is_unlocked));
+ ASSERT(is_unlocked(flags));
+ ASSERT(std::all_of(spills.begin(), spills.end(), is_unlocked));
+}
+
+void RegAlloc::AssertNoMoreUses() const {
+ const auto is_empty = [](const auto& i) { return i.IsCompletelyEmpty(); };
+ ASSERT(std::all_of(gprs.begin(), gprs.end(), is_empty));
+ ASSERT(std::all_of(fprs.begin(), fprs.end(), is_empty));
+ ASSERT(is_empty(flags));
+ ASSERT(std::all_of(spills.begin(), spills.end(), is_empty));
+}
+
+void RegAlloc::EmitVerboseDebuggingOutput() {
+ code.MOV(X19, mcl::bit_cast<u64>(&PrintVerboseDebuggingOutputLine)); // Non-volatile register
+
+ const auto do_location = [&](HostLocInfo& info, HostLocType type, size_t index) {
+ using namespace oaknut::util;
+ for (const IR::Inst* value : info.values) {
+ code.MOV(X0, SP);
+ code.MOV(X1, static_cast<u64>(type));
+ code.MOV(X2, index);
+ code.MOV(X3, value->GetName());
+ code.MOV(X4, static_cast<u64>(value->GetType()));
+ code.BLR(X19);
+ }
+ };
+
+ for (size_t i = 0; i < gprs.size(); i++) {
+ do_location(gprs[i], HostLocType::X, i);
+ }
+ for (size_t i = 0; i < fprs.size(); i++) {
+ do_location(fprs[i], HostLocType::Q, i);
+ }
+ do_location(flags, HostLocType::Nzcv, 0);
+ for (size_t i = 0; i < spills.size(); i++) {
+ do_location(spills[i], HostLocType::Spill, i);
+ }
+}
+
+template<HostLoc::Kind kind>
+int RegAlloc::GenerateImmediate(const IR::Value& value) {
+ ASSERT(value.GetType() != IR::Type::U1);
+ if constexpr (kind == HostLoc::Kind::Gpr) {
+ const int new_location_index = AllocateRegister(gprs, gpr_order);
+ SpillGpr(new_location_index);
+ gprs[new_location_index].SetupScratchLocation();
+
+ code.MOV(oaknut::XReg{new_location_index}, value.GetImmediateAsU64());
+
+ return new_location_index;
+ } else if constexpr (kind == HostLoc::Kind::Fpr) {
+ const int new_location_index = AllocateRegister(fprs, fpr_order);
+ SpillFpr(new_location_index);
+ fprs[new_location_index].SetupScratchLocation();
+
+ code.MOV(Xscratch0, value.GetImmediateAsU64());
+ code.FMOV(oaknut::DReg{new_location_index}, Xscratch0);
+
+ return new_location_index;
+ } else if constexpr (kind == HostLoc::Kind::Flags) {
+ SpillFlags();
+ flags.SetupScratchLocation();
+
+ code.MOV(Xscratch0, value.GetImmediateAsU64());
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+
+ return 0;
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>);
+ }
+}
+
+template<HostLoc::Kind required_kind>
+int RegAlloc::RealizeReadImpl(const IR::Value& value) {
+ if (value.IsImmediate()) {
+ return GenerateImmediate<required_kind>(value);
+ }
+
+ const auto current_location = ValueLocation(value.GetInst());
+ ASSERT(current_location);
+
+ if (current_location->kind == required_kind) {
+ ValueInfo(*current_location).realized = true;
+ return current_location->index;
+ }
+
+ ASSERT(!ValueInfo(*current_location).realized);
+ ASSERT(ValueInfo(*current_location).locked);
+
+ if constexpr (required_kind == HostLoc::Kind::Gpr) {
+ const int new_location_index = AllocateRegister(gprs, gpr_order);
+ SpillGpr(new_location_index);
+
+ switch (current_location->kind) {
+ case HostLoc::Kind::Gpr:
+ ASSERT_FALSE("Logic error");
+ break;
+ case HostLoc::Kind::Fpr:
+ code.FMOV(oaknut::XReg{new_location_index}, oaknut::DReg{current_location->index});
+            // TODO: assert that the value fits in 64 bits
+ break;
+ case HostLoc::Kind::Spill:
+ code.LDR(oaknut::XReg{new_location_index}, SP, spill_offset + current_location->index * spill_slot_size);
+ break;
+ case HostLoc::Kind::Flags:
+ code.MRS(oaknut::XReg{new_location_index}, oaknut::SystemReg::NZCV);
+ break;
+ }
+
+ gprs[new_location_index] = std::exchange(ValueInfo(*current_location), {});
+ gprs[new_location_index].realized = true;
+ return new_location_index;
+ } else if constexpr (required_kind == HostLoc::Kind::Fpr) {
+ const int new_location_index = AllocateRegister(fprs, fpr_order);
+ SpillFpr(new_location_index);
+
+ switch (current_location->kind) {
+ case HostLoc::Kind::Gpr:
+ code.FMOV(oaknut::DReg{new_location_index}, oaknut::XReg{current_location->index});
+ break;
+ case HostLoc::Kind::Fpr:
+ ASSERT_FALSE("Logic error");
+ break;
+ case HostLoc::Kind::Spill:
+ code.LDR(oaknut::QReg{new_location_index}, SP, spill_offset + current_location->index * spill_slot_size);
+ break;
+ case HostLoc::Kind::Flags:
+ ASSERT_FALSE("Moving from flags into fprs is not currently supported");
+ break;
+ }
+
+ fprs[new_location_index] = std::exchange(ValueInfo(*current_location), {});
+ fprs[new_location_index].realized = true;
+ return new_location_index;
+ } else if constexpr (required_kind == HostLoc::Kind::Flags) {
+ ASSERT_FALSE("A simple read from flags is likely a logic error.");
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<required_kind>>);
+ }
+}
+
+template<HostLoc::Kind kind>
+int RegAlloc::RealizeWriteImpl(const IR::Inst* value) {
+ defined_insts.insert(value);
+
+ ASSERT(!ValueLocation(value));
+
+ if constexpr (kind == HostLoc::Kind::Gpr) {
+ const int new_location_index = AllocateRegister(gprs, gpr_order);
+ SpillGpr(new_location_index);
+ gprs[new_location_index].SetupLocation(value);
+ return new_location_index;
+ } else if constexpr (kind == HostLoc::Kind::Fpr) {
+ const int new_location_index = AllocateRegister(fprs, fpr_order);
+ SpillFpr(new_location_index);
+ fprs[new_location_index].SetupLocation(value);
+ return new_location_index;
+ } else if constexpr (kind == HostLoc::Kind::Flags) {
+ SpillFlags();
+ flags.SetupLocation(value);
+ return 0;
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>);
+ }
+}
+
+template<HostLoc::Kind kind>
+int RegAlloc::RealizeReadWriteImpl(const IR::Value& read_value, const IR::Inst* write_value) {
+ defined_insts.insert(write_value);
+
+ // TODO: Move elimination
+
+ const int write_loc = RealizeWriteImpl<kind>(write_value);
+
+ if constexpr (kind == HostLoc::Kind::Gpr) {
+ LoadCopyInto(read_value, oaknut::XReg{write_loc});
+ return write_loc;
+ } else if constexpr (kind == HostLoc::Kind::Fpr) {
+ LoadCopyInto(read_value, oaknut::QReg{write_loc});
+ return write_loc;
+ } else if constexpr (kind == HostLoc::Kind::Flags) {
+ ASSERT_FALSE("Incorrect function for ReadWrite of flags");
+ } else {
+ static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>);
+ }
+}
+
+template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Gpr>(const IR::Value& value);
+template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Fpr>(const IR::Value& value);
+template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Flags>(const IR::Value& value);
+template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Gpr>(const IR::Inst* value);
+template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Fpr>(const IR::Inst* value);
+template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Flags>(const IR::Inst* value);
+template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Gpr>(const IR::Value&, const IR::Inst*);
+template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Fpr>(const IR::Value&, const IR::Inst*);
+template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Flags>(const IR::Value&, const IR::Inst*);
+
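+// Prefer a completely empty register; otherwise pick a random allocatable candidate for the caller to spill.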
+int RegAlloc::AllocateRegister(const std::array<HostLocInfo, 32>& regs, const std::vector<int>& order) const {
+ const auto empty = std::find_if(order.begin(), order.end(), [&](int i) { return regs[i].IsCompletelyEmpty(); });
+ if (empty != order.end()) {
+ return *empty;
+ }
+
+ std::vector<int> candidates;
+ std::copy_if(order.begin(), order.end(), std::back_inserter(candidates), [&](int i) { return regs[i].MaybeAllocatable(); });
+
+ // TODO: LRU
+ std::uniform_int_distribution<size_t> dis{0, candidates.size() - 1};
+ return candidates[dis(rand_gen)];
+}
+
+void RegAlloc::SpillGpr(int index) {
+ ASSERT(!gprs[index].locked && !gprs[index].realized);
+ if (gprs[index].values.empty()) {
+ return;
+ }
+ const int new_location_index = FindFreeSpill();
+ code.STR(oaknut::XReg{index}, SP, spill_offset + new_location_index * spill_slot_size);
+ spills[new_location_index] = std::exchange(gprs[index], {});
+}
+
+void RegAlloc::SpillFpr(int index) {
+ ASSERT(!fprs[index].locked && !fprs[index].realized);
+ if (fprs[index].values.empty()) {
+ return;
+ }
+ const int new_location_index = FindFreeSpill();
+ code.STR(oaknut::QReg{index}, SP, spill_offset + new_location_index * spill_slot_size);
+ spills[new_location_index] = std::exchange(fprs[index], {});
+}
+
+void RegAlloc::ReadWriteFlags(Argument& read, IR::Inst* write) {
+ defined_insts.insert(write);
+
+ const auto current_location = ValueLocation(read.value.GetInst());
+ ASSERT(current_location);
+
+ if (current_location->kind == HostLoc::Kind::Flags) {
+ if (!flags.IsOneRemainingUse()) {
+ SpillFlags();
+ }
+ } else if (current_location->kind == HostLoc::Kind::Gpr) {
+ if (!flags.values.empty()) {
+ SpillFlags();
+ }
+ code.MSR(oaknut::SystemReg::NZCV, oaknut::XReg{current_location->index});
+ } else if (current_location->kind == HostLoc::Kind::Spill) {
+ if (!flags.values.empty()) {
+ SpillFlags();
+ }
+ code.LDR(Wscratch0, SP, spill_offset + current_location->index * spill_slot_size);
+ code.MSR(oaknut::SystemReg::NZCV, Xscratch0);
+ } else {
+ ASSERT_FALSE("Invalid current location for flags");
+ }
+
+ if (write) {
+ flags.SetupLocation(write);
+ flags.realized = false;
+ }
+}
+
+void RegAlloc::SpillFlags() {
+ ASSERT(!flags.locked && !flags.realized);
+ if (flags.values.empty()) {
+ return;
+ }
+ const int new_location_index = AllocateRegister(gprs, gpr_order);
+ SpillGpr(new_location_index);
+ code.MRS(oaknut::XReg{new_location_index}, oaknut::SystemReg::NZCV);
+ gprs[new_location_index] = std::exchange(flags, {});
+}
+
+int RegAlloc::FindFreeSpill() const {
+ const auto iter = std::find_if(spills.begin(), spills.end(), [](const HostLocInfo& info) { return info.values.empty(); });
+ ASSERT_MSG(iter != spills.end(), "All spill locations are full");
+ return static_cast<int>(iter - spills.begin());
+}
+
+void RegAlloc::LoadCopyInto(const IR::Value& value, oaknut::XReg reg) {
+ if (value.IsImmediate()) {
+ code.MOV(reg, value.GetImmediateAsU64());
+ return;
+ }
+
+ const auto current_location = ValueLocation(value.GetInst());
+ ASSERT(current_location);
+ switch (current_location->kind) {
+ case HostLoc::Kind::Gpr:
+ code.MOV(reg, oaknut::XReg{current_location->index});
+ break;
+ case HostLoc::Kind::Fpr:
+ code.FMOV(reg, oaknut::DReg{current_location->index});
+        // TODO: assert that the value fits in 64 bits
+ break;
+ case HostLoc::Kind::Spill:
+ code.LDR(reg, SP, spill_offset + current_location->index * spill_slot_size);
+ break;
+ case HostLoc::Kind::Flags:
+ code.MRS(reg, oaknut::SystemReg::NZCV);
+ break;
+ }
+}
+
+void RegAlloc::LoadCopyInto(const IR::Value& value, oaknut::QReg reg) {
+ if (value.IsImmediate()) {
+ code.MOV(Xscratch0, value.GetImmediateAsU64());
+ code.FMOV(reg.toD(), Xscratch0);
+ return;
+ }
+
+ const auto current_location = ValueLocation(value.GetInst());
+ ASSERT(current_location);
+ switch (current_location->kind) {
+ case HostLoc::Kind::Gpr:
+ code.FMOV(reg.toD(), oaknut::XReg{current_location->index});
+ break;
+ case HostLoc::Kind::Fpr:
+ code.MOV(reg.B16(), oaknut::QReg{current_location->index}.B16());
+ break;
+ case HostLoc::Kind::Spill:
+ // TODO: Minimize move size to max value width
+ code.LDR(reg, SP, spill_offset + current_location->index * spill_slot_size);
+ break;
+ case HostLoc::Kind::Flags:
+ ASSERT_FALSE("Moving from flags into fprs is not currently supported");
+ break;
+ }
+}
+
+std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const {
+ const auto contains_value = [value](const HostLocInfo& info) { return info.Contains(value); };
+
+ if (const auto iter = std::find_if(gprs.begin(), gprs.end(), contains_value); iter != gprs.end()) {
+ return HostLoc{HostLoc::Kind::Gpr, static_cast<int>(iter - gprs.begin())};
+ }
+ if (const auto iter = std::find_if(fprs.begin(), fprs.end(), contains_value); iter != fprs.end()) {
+ return HostLoc{HostLoc::Kind::Fpr, static_cast<int>(iter - fprs.begin())};
+ }
+ if (contains_value(flags)) {
+ return HostLoc{HostLoc::Kind::Flags, 0};
+ }
+ if (const auto iter = std::find_if(spills.begin(), spills.end(), contains_value); iter != spills.end()) {
+ return HostLoc{HostLoc::Kind::Spill, static_cast<int>(iter - spills.begin())};
+ }
+ return std::nullopt;
+}
+
+HostLocInfo& RegAlloc::ValueInfo(HostLoc host_loc) {
+ switch (host_loc.kind) {
+ case HostLoc::Kind::Gpr:
+ return gprs[static_cast<size_t>(host_loc.index)];
+ case HostLoc::Kind::Fpr:
+ return fprs[static_cast<size_t>(host_loc.index)];
+ case HostLoc::Kind::Flags:
+ return flags;
+ case HostLoc::Kind::Spill:
+ return spills[static_cast<size_t>(host_loc.index)];
+ }
+ ASSERT_FALSE("RegAlloc::ValueInfo: Invalid HostLoc::Kind");
+}
+
+HostLocInfo& RegAlloc::ValueInfo(const IR::Inst* value) {
+ const auto contains_value = [value](const HostLocInfo& info) { return info.Contains(value); };
+
+ if (const auto iter = std::find_if(gprs.begin(), gprs.end(), contains_value); iter != gprs.end()) {
+ return *iter;
+ }
+ if (const auto iter = std::find_if(fprs.begin(), fprs.end(), contains_value); iter != fprs.end()) {
+ return *iter;
+ }
+ if (contains_value(flags)) {
+ return flags;
+ }
+ if (const auto iter = std::find_if(spills.begin(), spills.end(), contains_value); iter != spills.end()) {
+ return *iter;
+ }
+ ASSERT_FALSE("RegAlloc::ValueInfo: Value not found");
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h
new file mode 100644
index 0000000000..3a2d538819
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h
@@ -0,0 +1,376 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <optional>
+#include <random>
+#include <utility>
+#include <vector>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/is_instance_of_template.hpp>
+#include <oaknut/oaknut.hpp>
+#include <tsl/robin_set.h>
+
+#include "dynarmic/backend/arm64/stack_layout.h"
+#include "dynarmic/ir/cond.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+class FpsrManager;
+class RegAlloc;
+
+struct HostLoc final {
+ enum class Kind {
+ Gpr,
+ Fpr,
+ Flags,
+ Spill,
+ } kind;
+ int index;
+};
+
+enum RWType {
+ Void,
+ Read,
+ Write,
+ ReadWrite,
+};
+
+struct Argument final {
+public:
+ using copyable_reference = std::reference_wrapper<Argument>;
+
+ IR::Type GetType() const;
+ bool IsVoid() const { return GetType() == IR::Type::Void; }
+ bool IsImmediate() const;
+
+ bool GetImmediateU1() const;
+ u8 GetImmediateU8() const;
+ u16 GetImmediateU16() const;
+ u32 GetImmediateU32() const;
+ u64 GetImmediateU64() const;
+ IR::Cond GetImmediateCond() const;
+ IR::AccType GetImmediateAccType() const;
+
+ // Only valid if not immediate
+ HostLoc::Kind CurrentLocationKind() const;
+ bool IsInGpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Gpr; }
+ bool IsInFpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Fpr; }
+
+private:
+ friend class RegAlloc;
+ explicit Argument(RegAlloc& reg_alloc)
+ : reg_alloc{reg_alloc} {}
+
+ bool allocated = false;
+ RegAlloc& reg_alloc;
+ IR::Value value;
+};
+
+struct FlagsTag final {
+private:
+ template<typename>
+ friend struct RAReg;
+
+ explicit FlagsTag(int) {}
+ int index() const { return 0; }
+};
+
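+// RAII handle for a register request: construction locks the read value, Realize() materializes the host register, and destruction releases the lock and the realized flag.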
+template<typename T>
+struct RAReg final {
+public:
+ static constexpr HostLoc::Kind kind = !std::is_same_v<FlagsTag, T>
+ ? std::is_base_of_v<oaknut::VReg, T>
+ ? HostLoc::Kind::Fpr
+ : HostLoc::Kind::Gpr
+ : HostLoc::Kind::Flags;
+
+ operator T() const { return reg.value(); }
+
+ operator oaknut::WRegWsp() const
+ requires(std::is_same_v<T, oaknut::WReg>)
+ {
+ return reg.value();
+ }
+
+ operator oaknut::XRegSp() const
+ requires(std::is_same_v<T, oaknut::XReg>)
+ {
+ return reg.value();
+ }
+
+ T operator*() const { return reg.value(); }
+ const T* operator->() const { return &reg.value(); }
+
+ ~RAReg();
+ RAReg(RAReg&& other)
+ : reg_alloc{other.reg_alloc}
+ , rw{std::exchange(other.rw, RWType::Void)}
+ , read_value{std::exchange(other.read_value, {})}
+ , write_value{std::exchange(other.write_value, nullptr)}
+ , reg{std::exchange(other.reg, std::nullopt)} {
+ }
+ RAReg& operator=(RAReg&&) = delete;
+
+private:
+ friend class RegAlloc;
+ explicit RAReg(RegAlloc& reg_alloc, RWType rw, const IR::Value& read_value, const IR::Inst* write_value);
+
+ RAReg(const RAReg&) = delete;
+ RAReg& operator=(const RAReg&) = delete;
+
+ void Realize();
+
+ RegAlloc& reg_alloc;
+ RWType rw;
+ IR::Value read_value;
+ const IR::Inst* write_value;
+ std::optional<T> reg;
+};
+
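+// Describes what a single host location (register, flags, or spill slot) currently holds and how many uses remain.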
+struct HostLocInfo final {
+ std::vector<const IR::Inst*> values;
+ size_t locked = 0;
+ bool realized = false;
+ size_t uses_this_inst = 0;
+ size_t accumulated_uses = 0;
+ size_t expected_uses = 0;
+
+ bool Contains(const IR::Inst*) const;
+ void SetupScratchLocation();
+ void SetupLocation(const IR::Inst*);
+ bool IsCompletelyEmpty() const;
+ bool MaybeAllocatable() const;
+ bool IsOneRemainingUse() const;
+ void UpdateUses();
+};
+
+class RegAlloc final {
+public:
+ using ArgumentInfo = std::array<Argument, IR::max_arg_count>;
+
+ explicit RegAlloc(oaknut::CodeGenerator& code, FpsrManager& fpsr_manager, std::vector<int> gpr_order, std::vector<int> fpr_order)
+ : code{code}, fpsr_manager{fpsr_manager}, gpr_order{gpr_order}, fpr_order{fpr_order}, rand_gen{std::random_device{}()} {}
+
+ ArgumentInfo GetArgumentInfo(IR::Inst* inst);
+ bool WasValueDefined(IR::Inst* inst) const;
+
+ auto ReadX(Argument& arg) { return RAReg<oaknut::XReg>{*this, RWType::Read, arg.value, nullptr}; }
+ auto ReadW(Argument& arg) { return RAReg<oaknut::WReg>{*this, RWType::Read, arg.value, nullptr}; }
+
+ auto ReadQ(Argument& arg) { return RAReg<oaknut::QReg>{*this, RWType::Read, arg.value, nullptr}; }
+ auto ReadD(Argument& arg) { return RAReg<oaknut::DReg>{*this, RWType::Read, arg.value, nullptr}; }
+ auto ReadS(Argument& arg) { return RAReg<oaknut::SReg>{*this, RWType::Read, arg.value, nullptr}; }
+ auto ReadH(Argument& arg) { return RAReg<oaknut::HReg>{*this, RWType::Read, arg.value, nullptr}; }
+ auto ReadB(Argument& arg) { return RAReg<oaknut::BReg>{*this, RWType::Read, arg.value, nullptr}; }
+
+ template<size_t size>
+ auto ReadReg(Argument& arg) {
+ if constexpr (size == 64) {
+ return ReadX(arg);
+ } else if constexpr (size == 32) {
+ return ReadW(arg);
+ } else {
+ ASSERT_FALSE("Invalid size to ReadReg {}", size);
+ }
+ }
+
+ template<size_t size>
+ auto ReadVec(Argument& arg) {
+ if constexpr (size == 128) {
+ return ReadQ(arg);
+ } else if constexpr (size == 64) {
+ return ReadD(arg);
+ } else if constexpr (size == 32) {
+ return ReadS(arg);
+ } else if constexpr (size == 16) {
+ return ReadH(arg);
+ } else if constexpr (size == 8) {
+ return ReadB(arg);
+ } else {
+ ASSERT_FALSE("Invalid size to ReadVec {}", size);
+ }
+ }
+
+ auto WriteX(IR::Inst* inst) { return RAReg<oaknut::XReg>{*this, RWType::Write, {}, inst}; }
+ auto WriteW(IR::Inst* inst) { return RAReg<oaknut::WReg>{*this, RWType::Write, {}, inst}; }
+
+ auto WriteQ(IR::Inst* inst) { return RAReg<oaknut::QReg>{*this, RWType::Write, {}, inst}; }
+ auto WriteD(IR::Inst* inst) { return RAReg<oaknut::DReg>{*this, RWType::Write, {}, inst}; }
+ auto WriteS(IR::Inst* inst) { return RAReg<oaknut::SReg>{*this, RWType::Write, {}, inst}; }
+ auto WriteH(IR::Inst* inst) { return RAReg<oaknut::HReg>{*this, RWType::Write, {}, inst}; }
+ auto WriteB(IR::Inst* inst) { return RAReg<oaknut::BReg>{*this, RWType::Write, {}, inst}; }
+
+ auto WriteFlags(IR::Inst* inst) { return RAReg<FlagsTag>{*this, RWType::Write, {}, inst}; }
+
+ template<size_t size>
+ auto WriteReg(IR::Inst* inst) {
+ if constexpr (size == 64) {
+ return WriteX(inst);
+ } else if constexpr (size == 32) {
+ return WriteW(inst);
+ } else {
+ ASSERT_FALSE("Invalid size to WriteReg {}", size);
+ }
+ }
+
+ template<size_t size>
+ auto WriteVec(IR::Inst* inst) {
+ if constexpr (size == 128) {
+ return WriteQ(inst);
+ } else if constexpr (size == 64) {
+ return WriteD(inst);
+ } else if constexpr (size == 32) {
+ return WriteS(inst);
+ } else if constexpr (size == 16) {
+ return WriteH(inst);
+ } else if constexpr (size == 8) {
+ return WriteB(inst);
+ } else {
+ ASSERT_FALSE("Invalid size to WriteVec {}", size);
+ }
+ }
+
+ auto ReadWriteX(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::XReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+ auto ReadWriteW(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::WReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+
+ auto ReadWriteQ(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::QReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+ auto ReadWriteD(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::DReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+ auto ReadWriteS(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::SReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+ auto ReadWriteH(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::HReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+ auto ReadWriteB(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::BReg>{*this, RWType::ReadWrite, arg.value, inst}; }
+
+ template<size_t size>
+ auto ReadWriteReg(Argument& arg, const IR::Inst* inst) {
+ if constexpr (size == 64) {
+ return ReadWriteX(arg, inst);
+ } else if constexpr (size == 32) {
+ return ReadWriteW(arg, inst);
+ } else {
+ ASSERT_FALSE("Invalid size to ReadWriteReg {}", size);
+ }
+ }
+
+ template<size_t size>
+ auto ReadWriteVec(Argument& arg, const IR::Inst* inst) {
+ if constexpr (size == 128) {
+ return ReadWriteQ(arg, inst);
+ } else if constexpr (size == 64) {
+ return ReadWriteD(arg, inst);
+ } else if constexpr (size == 32) {
+ return ReadWriteS(arg, inst);
+ } else if constexpr (size == 16) {
+ return ReadWriteH(arg, inst);
+ } else if constexpr (size == 8) {
+ return ReadWriteB(arg, inst);
+ } else {
+ ASSERT_FALSE("Invalid size to ReadWriteVec {}", size);
+ }
+ }
+
+ void PrepareForCall(std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}, std::optional<Argument::copyable_reference> arg2 = {}, std::optional<Argument::copyable_reference> arg3 = {});
+
+ void DefineAsExisting(IR::Inst* inst, Argument& arg);
+ void DefineAsRegister(IR::Inst* inst, oaknut::Reg reg);
+
+ void ReadWriteFlags(Argument& read, IR::Inst* write);
+ void SpillFlags();
+ void SpillAll();
+
+ template<typename... Ts>
+ static void Realize(Ts&... rs) {
+ static_assert((mcl::is_instance_of_template<RAReg, Ts>() && ...));
+ (rs.Realize(), ...);
+ }
+
+ void UpdateAllUses();
+ void AssertAllUnlocked() const;
+ void AssertNoMoreUses() const;
+
+ void EmitVerboseDebuggingOutput();
+
+private:
+ friend struct Argument;
+ template<typename>
+ friend struct RAReg;
+
+ template<HostLoc::Kind kind>
+ int GenerateImmediate(const IR::Value& value);
+ template<HostLoc::Kind kind>
+ int RealizeReadImpl(const IR::Value& value);
+ template<HostLoc::Kind kind>
+ int RealizeWriteImpl(const IR::Inst* value);
+ template<HostLoc::Kind kind>
+ int RealizeReadWriteImpl(const IR::Value& read_value, const IR::Inst* write_value);
+
+ int AllocateRegister(const std::array<HostLocInfo, 32>& regs, const std::vector<int>& order) const;
+ void SpillGpr(int index);
+ void SpillFpr(int index);
+ int FindFreeSpill() const;
+
+ void LoadCopyInto(const IR::Value& value, oaknut::XReg reg);
+ void LoadCopyInto(const IR::Value& value, oaknut::QReg reg);
+
+ std::optional<HostLoc> ValueLocation(const IR::Inst* value) const;
+ HostLocInfo& ValueInfo(HostLoc host_loc);
+ HostLocInfo& ValueInfo(const IR::Inst* value);
+
+ oaknut::CodeGenerator& code;
+ FpsrManager& fpsr_manager;
+ std::vector<int> gpr_order;
+ std::vector<int> fpr_order;
+
+ std::array<HostLocInfo, 32> gprs;
+ std::array<HostLocInfo, 32> fprs;
+ HostLocInfo flags;
+ std::array<HostLocInfo, SpillCount> spills;
+
+ mutable std::mt19937 rand_gen;
+
+ tsl::robin_set<const IR::Inst*> defined_insts;
+};
+
+template<typename T>
+RAReg<T>::RAReg(RegAlloc& reg_alloc, RWType rw, const IR::Value& read_value, const IR::Inst* write_value)
+ : reg_alloc{reg_alloc}, rw{rw}, read_value{read_value}, write_value{write_value} {
+ if (rw != RWType::Write && !read_value.IsImmediate()) {
+ reg_alloc.ValueInfo(read_value.GetInst()).locked++;
+ }
+}
+
+template<typename T>
+RAReg<T>::~RAReg() {
+ if (rw != RWType::Write && !read_value.IsImmediate()) {
+ reg_alloc.ValueInfo(read_value.GetInst()).locked--;
+ }
+ if (reg) {
+ reg_alloc.ValueInfo(HostLoc{kind, reg->index()}).realized = false;
+ }
+}
+
+template<typename T>
+void RAReg<T>::Realize() {
+ switch (rw) {
+ case RWType::Read:
+ reg = T{reg_alloc.RealizeReadImpl<kind>(read_value)};
+ break;
+ case RWType::Write:
+ reg = T{reg_alloc.RealizeWriteImpl<kind>(write_value)};
+ break;
+ case RWType::ReadWrite:
+ reg = T{reg_alloc.RealizeReadWriteImpl<kind>(read_value, write_value)};
+ break;
+ default:
+ ASSERT_FALSE("Invalid RWType");
+ }
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h b/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h
new file mode 100644
index 0000000000..cf7f3259a9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h
@@ -0,0 +1,49 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::Arm64 {
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+constexpr size_t SpillCount = 64;
+
+struct alignas(16) RSBEntry {
+ u64 target;
+ u64 code_ptr;
+};
+
+constexpr size_t RSBCount = 8;
+constexpr u64 RSBIndexMask = (RSBCount - 1) * sizeof(RSBEntry);
+
+struct alignas(16) StackLayout {
+ std::array<RSBEntry, RSBCount> rsb;
+
+ std::array<std::array<u64, 2>, SpillCount> spill;
+
+ u32 rsb_ptr;
+
+ s64 cycles_to_run;
+
+ u32 save_host_fpcr;
+
+ bool check_bit;
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+static_assert(sizeof(StackLayout) % 16 == 0);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp
new file mode 100644
index 0000000000..aec0472f68
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp
@@ -0,0 +1,108 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/arm64/verbose_debugging_output.h"
+
+#include <fmt/format.h>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/emit_context.h"
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+using namespace oaknut::util;
+
+void EmitVerboseDebuggingOutput(oaknut::CodeGenerator& code, EmitContext& ctx) {
+ code.SUB(SP, SP, sizeof(RegisterData));
+ for (int i = 0; i < 30; i++) {
+ if (i == 18) {
+ continue; // Platform register
+ }
+ code.STR(oaknut::XReg{i}, SP, offsetof(RegisterData, x) + i * sizeof(u64));
+ }
+ for (int i = 0; i < 32; i++) {
+ code.STR(oaknut::QReg{i}, SP, offsetof(RegisterData, q) + i * sizeof(Vector));
+ }
+ code.MRS(X0, oaknut::SystemReg::NZCV);
+ code.STR(X0, SP, offsetof(RegisterData, nzcv));
+ code.ADD(X0, SP, sizeof(RegisterData) + offsetof(StackLayout, spill));
+ code.STR(X0, SP, offsetof(RegisterData, spill));
+ code.MRS(X0, oaknut::SystemReg::FPSR);
+ code.STR(X0, SP, offsetof(RegisterData, fpsr));
+
+ ctx.reg_alloc.EmitVerboseDebuggingOutput();
+
+ code.LDR(X0, SP, offsetof(RegisterData, fpsr));
+ code.MSR(oaknut::SystemReg::FPSR, X0);
+ code.LDR(X0, SP, offsetof(RegisterData, nzcv));
+ code.MSR(oaknut::SystemReg::NZCV, X0);
+ for (int i = 0; i < 32; i++) {
+ code.LDR(oaknut::QReg{i}, SP, offsetof(RegisterData, q) + i * sizeof(Vector));
+ }
+ for (int i = 0; i < 30; i++) {
+ if (i == 18) {
+ continue; // Platform register
+ }
+ code.LDR(oaknut::XReg{i}, SP, offsetof(RegisterData, x) + i * sizeof(u64));
+ }
+ code.ADD(SP, SP, sizeof(RegisterData));
+}
+
+void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLocType reg_type, size_t reg_index, size_t inst_index, IR::Type inst_type) {
+ fmt::print("dynarmic debug: %{:05} = ", inst_index);
+
+ Vector value = [&]() -> Vector {
+ switch (reg_type) {
+ case HostLocType::X:
+ return {reg_data.x[reg_index], 0};
+ case HostLocType::Q:
+ return reg_data.q[reg_index];
+ case HostLocType::Nzcv:
+ return {reg_data.nzcv, 0};
+ case HostLocType::Spill:
+ return (*reg_data.spill)[reg_index];
+ }
+ fmt::print("invalid reg_type! ");
+ return {0, 0};
+ }();
+
+ switch (inst_type) {
+ case IR::Type::U1:
+ case IR::Type::U8:
+ fmt::print("{:02x}", value[0] & 0xff);
+ break;
+ case IR::Type::U16:
+ fmt::print("{:04x}", value[0] & 0xffff);
+ break;
+ case IR::Type::U32:
+ case IR::Type::NZCVFlags:
+ fmt::print("{:08x}", value[0] & 0xffffffff);
+ break;
+ case IR::Type::U64:
+ fmt::print("{:016x}", value[0]);
+ break;
+ case IR::Type::U128:
+ fmt::print("{:016x}{:016x}", value[1], value[0]);
+ break;
+ case IR::Type::A32Reg:
+ case IR::Type::A32ExtReg:
+ case IR::Type::A64Reg:
+ case IR::Type::A64Vec:
+ case IR::Type::CoprocInfo:
+ case IR::Type::Cond:
+ case IR::Type::Void:
+ case IR::Type::Table:
+ case IR::Type::AccType:
+ case IR::Type::Opaque:
+ default:
+ fmt::print("invalid inst_type!");
+ break;
+ }
+
+ fmt::print("\n");
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h
new file mode 100644
index 0000000000..73c6646139
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h
@@ -0,0 +1,56 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/arm64/stack_layout.h"
+
+namespace oaknut {
+struct CodeGenerator;
+struct Label;
+} // namespace oaknut
+
+namespace Dynarmic::IR {
+enum class Type;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct EmitContext;
+
+using Vector = std::array<u64, 2>;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+enum class HostLocType {
+ X,
+ Q,
+ Nzcv,
+ Spill,
+};
+
+struct alignas(16) RegisterData {
+ std::array<u64, 30> x;
+ std::array<Vector, 32> q;
+ u32 nzcv;
+ decltype(StackLayout::spill)* spill;
+ u32 fpsr;
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+void EmitVerboseDebuggingOutput(oaknut::CodeGenerator& code, EmitContext& ctx);
+void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLocType reg_type, size_t reg_index, size_t inst_index, IR::Type inst_type);
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp b/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp
new file mode 100644
index 0000000000..47e512f5ce
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp
@@ -0,0 +1,43 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/block_range_information.h"
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <mcl/stdint.hpp>
+#include <tsl/robin_set.h>
+
+namespace Dynarmic::Backend {
+
+template<typename ProgramCounterType>
+void BlockRangeInformation<ProgramCounterType>::AddRange(boost::icl::discrete_interval<ProgramCounterType> range, IR::LocationDescriptor location) {
+ block_ranges.add(std::make_pair(range, std::set<IR::LocationDescriptor>{location}));
+}
+
+template<typename ProgramCounterType>
+void BlockRangeInformation<ProgramCounterType>::ClearCache() {
+ block_ranges.clear();
+}
+
+template<typename ProgramCounterType>
+tsl::robin_set<IR::LocationDescriptor> BlockRangeInformation<ProgramCounterType>::InvalidateRanges(const boost::icl::interval_set<ProgramCounterType>& ranges) {
+ tsl::robin_set<IR::LocationDescriptor> erase_locations;
+ for (auto invalidate_interval : ranges) {
+ auto pair = block_ranges.equal_range(invalidate_interval);
+ for (auto it = pair.first; it != pair.second; ++it) {
+ for (const auto& descriptor : it->second) {
+ erase_locations.insert(descriptor);
+ }
+ }
+ }
+ // TODO: EFFICIENCY: Remove ranges that are to be erased.
+ return erase_locations;
+}
+
+template class BlockRangeInformation<u32>;
+template class BlockRangeInformation<u64>;
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/block_range_information.h b/externals/dynarmic/src/dynarmic/backend/block_range_information.h
new file mode 100644
index 0000000000..3402b155bb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/block_range_information.h
@@ -0,0 +1,29 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <set>
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <tsl/robin_set.h>
+
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::Backend {
+
+template<typename ProgramCounterType>
+class BlockRangeInformation {
+public:
+ void AddRange(boost::icl::discrete_interval<ProgramCounterType> range, IR::LocationDescriptor location);
+ void ClearCache();
+ tsl::robin_set<IR::LocationDescriptor> InvalidateRanges(const boost::icl::interval_set<ProgramCounterType>& ranges);
+
+private:
+ boost::icl::interval_map<ProgramCounterType, std::set<IR::LocationDescriptor>> block_ranges;
+};
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler.h b/externals/dynarmic/src/dynarmic/backend/exception_handler.h
new file mode 100644
index 0000000000..179433ba5d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler.h
@@ -0,0 +1,63 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+
+#include <mcl/macro/architecture.hpp>
+#include <mcl/stdint.hpp>
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+namespace Dynarmic::Backend::X64 {
+class BlockOfCode;
+} // namespace Dynarmic::Backend::X64
+#elif defined(MCL_ARCHITECTURE_ARM64)
+namespace oaknut {
+class CodeBlock;
+} // namespace oaknut
+#else
+# error "Invalid architecture"
+#endif
+
+namespace Dynarmic::Backend {
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+struct FakeCall {
+ u64 call_rip;
+ u64 ret_rip;
+};
+#elif defined(MCL_ARCHITECTURE_ARM64)
+struct FakeCall {
+ u64 call_pc;
+};
+#else
+# error "Invalid architecture"
+#endif
+
+class ExceptionHandler final {
+public:
+ ExceptionHandler();
+ ~ExceptionHandler();
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+ void Register(X64::BlockOfCode& code);
+#elif defined(MCL_ARCHITECTURE_ARM64)
+ void Register(oaknut::CodeBlock& mem, std::size_t mem_size);
+#else
+# error "Invalid architecture"
+#endif
+
+ bool SupportsFastmem() const noexcept;
+ void SetFastmemCallback(std::function<FakeCall(u64)> cb);
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> impl;
+};
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp
new file mode 100644
index 0000000000..bbaa071a5a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp
@@ -0,0 +1,36 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/exception_handler.h"
+
+namespace Dynarmic::Backend {
+
+struct ExceptionHandler::Impl final {
+};
+
+ExceptionHandler::ExceptionHandler() = default;
+ExceptionHandler::~ExceptionHandler() = default;
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+void ExceptionHandler::Register(X64::BlockOfCode&) {
+ // Do nothing
+}
+#elif defined(MCL_ARCHITECTURE_ARM64)
+void ExceptionHandler::Register(oaknut::CodeBlock&, std::size_t) {
+ // Do nothing
+}
+#else
+# error "Invalid architecture"
+#endif
+
+bool ExceptionHandler::SupportsFastmem() const noexcept {
+ return false;
+}
+
+void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)>) {
+ // Do nothing
+}
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp
new file mode 100644
index 0000000000..bac8d9c951
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp
@@ -0,0 +1,293 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mach/mach.h>
+#include <mach/message.h>
+
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <thread>
+#include <vector>
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/macro/architecture.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/exception_handler.h"
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+
+# include "dynarmic/backend/x64/block_of_code.h"
+# define mig_external extern "C"
+# include "dynarmic/backend/x64/mig/mach_exc_server.h"
+
+# define THREAD_STATE x86_THREAD_STATE64
+# define THREAD_STATE_COUNT x86_THREAD_STATE64_COUNT
+
+using dynarmic_thread_state_t = x86_thread_state64_t;
+
+#elif defined(MCL_ARCHITECTURE_ARM64)
+
+# include <oaknut/code_block.hpp>
+# define mig_external extern "C"
+# include "dynarmic/backend/arm64/mig/mach_exc_server.h"
+
+# define THREAD_STATE ARM_THREAD_STATE64
+# define THREAD_STATE_COUNT ARM_THREAD_STATE64_COUNT
+
+using dynarmic_thread_state_t = arm_thread_state64_t;
+
+#endif
+
+namespace Dynarmic::Backend {
+
+namespace {
+
+struct CodeBlockInfo {
+ u64 code_begin, code_end;
+ std::function<FakeCall(u64)> cb;
+};
+
+struct MachMessage {
+ mach_msg_header_t head;
+ char data[2048]; ///< Arbitrary size
+};
+
+class MachHandler final {
+public:
+ MachHandler();
+ ~MachHandler();
+
+ kern_return_t HandleRequest(dynarmic_thread_state_t* thread_state);
+
+ void AddCodeBlock(CodeBlockInfo info);
+ void RemoveCodeBlock(u64 rip);
+
+private:
+ auto FindCodeBlockInfo(u64 rip) {
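+ // Returns an iterator to the block whose half-open range [code_begin, code_end) contains rip, or end().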
+ return std::find_if(code_block_infos.begin(), code_block_infos.end(), [&](const auto& x) { return x.code_begin <= rip && x.code_end > rip; });
+ }
+
+ std::vector<CodeBlockInfo> code_block_infos;
+ std::mutex code_block_infos_mutex;
+
+ std::thread thread;
+ mach_port_t server_port;
+
+ void MessagePump();
+};
+
+MachHandler::MachHandler() {
+#define KCHECK(x) ASSERT_MSG((x) == KERN_SUCCESS, "dynarmic: macOS MachHandler: init failure at {}", #x)
+
+ KCHECK(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &server_port));
+ KCHECK(mach_port_insert_right(mach_task_self(), server_port, server_port, MACH_MSG_TYPE_MAKE_SEND));
+ KCHECK(task_set_exception_ports(mach_task_self(), EXC_MASK_BAD_ACCESS, server_port, EXCEPTION_STATE | MACH_EXCEPTION_CODES, THREAD_STATE));
+
+ // The below doesn't actually work (reason unknown); as a consequence, a spurious error message is printed on shutdown.
+ mach_port_t prev;
+ KCHECK(mach_port_request_notification(mach_task_self(), server_port, MACH_NOTIFY_PORT_DESTROYED, 0, server_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &prev));
+
+#undef KCHECK
+
+ thread = std::thread(&MachHandler::MessagePump, this);
+ thread.detach();
+}
+
+MachHandler::~MachHandler() {
+ mach_port_deallocate(mach_task_self(), server_port);
+}
+
+void MachHandler::MessagePump() {
+ mach_msg_return_t mr;
+ MachMessage request;
+ MachMessage reply;
+
+ while (true) {
+ mr = mach_msg(&request.head, MACH_RCV_MSG | MACH_RCV_LARGE, 0, sizeof(request), server_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
+ if (mr != MACH_MSG_SUCCESS) {
+ fmt::print(stderr, "dynarmic: macOS MachHandler: Failed to receive mach message. error: {:#08x} ({})\n", mr, mach_error_string(mr));
+ return;
+ }
+
+ if (!mach_exc_server(&request.head, &reply.head)) {
+ fmt::print(stderr, "dynarmic: macOS MachHandler: Unexpected mach message\n");
+ return;
+ }
+
+ mr = mach_msg(&reply.head, MACH_SEND_MSG, reply.head.msgh_size, 0, MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
+ if (mr != MACH_MSG_SUCCESS) {
+ fmt::print(stderr, "dynarmic: macOS MachHandler: Failed to send mach message. error: {:#08x} ({})\n", mr, mach_error_string(mr));
+ return;
+ }
+ }
+}
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+kern_return_t MachHandler::HandleRequest(x86_thread_state64_t* ts) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+
+ const auto iter = FindCodeBlockInfo(ts->__rip);
+ if (iter == code_block_infos.end()) {
+ fmt::print(stderr, "Unhandled EXC_BAD_ACCESS at rip {:#016x}\n", ts->__rip);
+ return KERN_FAILURE;
+ }
+
+ FakeCall fc = iter->cb(ts->__rip);
+
+ ts->__rsp -= sizeof(u64);
+ *mcl::bit_cast<u64*>(ts->__rsp) = fc.ret_rip;
+ ts->__rip = fc.call_rip;
+
+ return KERN_SUCCESS;
+}
+#elif defined(MCL_ARCHITECTURE_ARM64)
+kern_return_t MachHandler::HandleRequest(arm_thread_state64_t* ts) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+
+ const auto iter = FindCodeBlockInfo(ts->__pc);
+ if (iter == code_block_infos.end()) {
+ fmt::print(stderr, "Unhandled EXC_BAD_ACCESS at pc {:#016x}\n", ts->__pc);
+ return KERN_FAILURE;
+ }
+
+ FakeCall fc = iter->cb(ts->__pc);
+
+ // TODO: Sign with ptrauth_sign_unauthenticated if pointer authentication is enabled.
+ ts->__pc = fc.call_pc;
+
+ return KERN_SUCCESS;
+}
+#endif
+
+void MachHandler::AddCodeBlock(CodeBlockInfo cbi) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+ if (auto iter = FindCodeBlockInfo(cbi.code_begin); iter != code_block_infos.end()) {
+ code_block_infos.erase(iter);
+ }
+ code_block_infos.push_back(cbi);
+}
+
+void MachHandler::RemoveCodeBlock(u64 rip) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+ const auto iter = FindCodeBlockInfo(rip);
+ if (iter == code_block_infos.end()) {
+ return;
+ }
+ code_block_infos.erase(iter);
+}
+
+std::mutex handler_lock;
+std::optional<MachHandler> mach_handler;
+
+void RegisterHandler() {
+ std::lock_guard<std::mutex> guard(handler_lock);
+ if (!mach_handler) {
+ mach_handler.emplace();
+ }
+}
+
+} // anonymous namespace
+
+mig_external kern_return_t catch_mach_exception_raise(mach_port_t, mach_port_t, mach_port_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t) {
+ fmt::print(stderr, "dynarmic: Unexpected mach message: mach_exception_raise\n");
+ return KERN_FAILURE;
+}
+
+mig_external kern_return_t catch_mach_exception_raise_state_identity(mach_port_t, mach_port_t, mach_port_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t, int*, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t*) {
+ fmt::print(stderr, "dynarmic: Unexpected mach message: mach_exception_raise_state_identity\n");
+ return KERN_FAILURE;
+}
+
+mig_external kern_return_t catch_mach_exception_raise_state(
+ mach_port_t /*exception_port*/,
+ exception_type_t exception,
+ const mach_exception_data_t /*code*/, // code[0] is as per kern_return.h, code[1] is rip.
+ mach_msg_type_number_t /*codeCnt*/,
+ int* flavor,
+ const thread_state_t old_state,
+ mach_msg_type_number_t old_stateCnt,
+ thread_state_t new_state,
+ mach_msg_type_number_t* new_stateCnt) {
+ if (!flavor || !new_stateCnt) {
+ fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Invalid arguments.\n");
+ return KERN_INVALID_ARGUMENT;
+ }
+ if (*flavor != THREAD_STATE || old_stateCnt != THREAD_STATE_COUNT || *new_stateCnt < THREAD_STATE_COUNT) {
+ fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Unexpected flavor.\n");
+ return KERN_INVALID_ARGUMENT;
+ }
+ if (exception != EXC_BAD_ACCESS) {
+ fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Unexpected exception type.\n");
+ return KERN_FAILURE;
+ }
+
+ // The input/output pointers are not necessarily 8-byte aligned.
+ dynarmic_thread_state_t ts;
+ std::memcpy(&ts, old_state, sizeof(ts));
+
+ kern_return_t ret = mach_handler->HandleRequest(&ts);
+
+ std::memcpy(new_state, &ts, sizeof(ts));
+ *new_stateCnt = THREAD_STATE_COUNT;
+ return ret;
+}
+
+struct ExceptionHandler::Impl final {
+ Impl(u64 code_begin_, u64 code_end_)
+ : code_begin(code_begin_)
+ , code_end(code_end_) {
+ RegisterHandler();
+ }
+
+ void SetCallback(std::function<FakeCall(u64)> cb) {
+ CodeBlockInfo cbi;
+ cbi.code_begin = code_begin;
+ cbi.code_end = code_end;
+ cbi.cb = cb;
+ mach_handler->AddCodeBlock(cbi);
+ }
+
+ ~Impl() {
+ mach_handler->RemoveCodeBlock(code_begin);
+ }
+
+private:
+ u64 code_begin, code_end;
+};
+
+ExceptionHandler::ExceptionHandler() = default;
+ExceptionHandler::~ExceptionHandler() = default;
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+void ExceptionHandler::Register(X64::BlockOfCode& code) {
+ const u64 code_begin = mcl::bit_cast<u64>(code.getCode());
+ const u64 code_end = code_begin + code.GetTotalCodeSize();
+ impl = std::make_unique<Impl>(code_begin, code_end);
+}
+#elif defined(MCL_ARCHITECTURE_ARM64)
+void ExceptionHandler::Register(oaknut::CodeBlock& mem, std::size_t size) {
+ const u64 code_begin = mcl::bit_cast<u64>(mem.ptr());
+ const u64 code_end = code_begin + size;
+ impl = std::make_unique<Impl>(code_begin, code_end);
+}
+#else
+# error "Invalid architecture"
+#endif
+
+bool ExceptionHandler::SupportsFastmem() const noexcept {
+ return static_cast<bool>(impl);
+}
+
+void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) {
+ impl->SetCallback(cb);
+}
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c
new file mode 100644
index 0000000000..762a80ca42
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c
@@ -0,0 +1,14 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/macro/architecture.hpp>
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+# include "dynarmic/backend/x64/mig/mach_exc_server.c"
+#elif defined(MCL_ARCHITECTURE_ARM64)
+# include "dynarmic/backend/arm64/mig/mach_exc_server.c"
+#else
+# error "Invalid architecture"
+#endif
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp
new file mode 100644
index 0000000000..0a1c270b5e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp
@@ -0,0 +1,319 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/exception_handler.h"
+
+#ifdef __APPLE__
+# include <signal.h>
+# include <sys/ucontext.h>
+#else
+# include <signal.h>
+# ifndef __OpenBSD__
+# include <ucontext.h>
+# endif
+#endif
+
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <vector>
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/stdint.hpp>
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+# include "dynarmic/backend/x64/block_of_code.h"
+#elif defined(MCL_ARCHITECTURE_ARM64)
+# include <oaknut/code_block.hpp>
+
+# include "dynarmic/backend/arm64/abi.h"
+#else
+# error "Invalid architecture"
+#endif
+
+namespace Dynarmic::Backend {
+
+namespace {
+
+struct CodeBlockInfo {
+ u64 code_begin, code_end;
+ std::function<FakeCall(u64)> cb;
+};
+
+class SigHandler {
+public:
+ SigHandler();
+ ~SigHandler();
+
+ void AddCodeBlock(CodeBlockInfo info);
+ void RemoveCodeBlock(u64 host_pc);
+
+ bool SupportsFastmem() const { return supports_fast_mem; }
+
+private:
+ auto FindCodeBlockInfo(u64 host_pc) {
+ return std::find_if(code_block_infos.begin(), code_block_infos.end(), [&](const auto& x) { return x.code_begin <= host_pc && x.code_end > host_pc; });
+ }
+
+ bool supports_fast_mem = true;
+
+ void* signal_stack_memory = nullptr;
+
+ std::vector<CodeBlockInfo> code_block_infos;
+ std::mutex code_block_infos_mutex;
+
+ struct sigaction old_sa_segv;
+ struct sigaction old_sa_bus;
+
+ static void SigAction(int sig, siginfo_t* info, void* raw_context);
+};
+
+std::mutex handler_lock;
+std::optional<SigHandler> sig_handler;
+
+void RegisterHandler() {
+ std::lock_guard<std::mutex> guard(handler_lock);
+ if (!sig_handler) {
+ sig_handler.emplace();
+ }
+}
+
+SigHandler::SigHandler() {
+ const size_t signal_stack_size = std::max<size_t>(SIGSTKSZ, 2 * 1024 * 1024);
+
+ signal_stack_memory = std::malloc(signal_stack_size);
+
+ stack_t signal_stack;
+ signal_stack.ss_sp = signal_stack_memory;
+ signal_stack.ss_size = signal_stack_size;
+ signal_stack.ss_flags = 0;
+ if (sigaltstack(&signal_stack, nullptr) != 0) {
+ fmt::print(stderr, "dynarmic: POSIX SigHandler: init failure at sigaltstack\n");
+ supports_fast_mem = false;
+ return;
+ }
+
+ struct sigaction sa;
+ sa.sa_handler = nullptr;
+ sa.sa_sigaction = &SigHandler::SigAction;
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, &old_sa_segv) != 0) {
+ fmt::print(stderr, "dynarmic: POSIX SigHandler: could not set SIGSEGV handler\n");
+ supports_fast_mem = false;
+ return;
+ }
+#ifdef __APPLE__
+ if (sigaction(SIGBUS, &sa, &old_sa_bus) != 0) {
+ fmt::print(stderr, "dynarmic: POSIX SigHandler: could not set SIGBUS handler\n");
+ supports_fast_mem = false;
+ return;
+ }
+#endif
+}
+
+SigHandler::~SigHandler() {
+ std::free(signal_stack_memory);
+}
+
+void SigHandler::AddCodeBlock(CodeBlockInfo cbi) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+ if (auto iter = FindCodeBlockInfo(cbi.code_begin); iter != code_block_infos.end()) {
+ code_block_infos.erase(iter);
+ }
+ code_block_infos.push_back(cbi);
+}
+
+void SigHandler::RemoveCodeBlock(u64 host_pc) {
+ std::lock_guard<std::mutex> guard(code_block_infos_mutex);
+ const auto iter = FindCodeBlockInfo(host_pc);
+ if (iter == code_block_infos.end()) {
+ return;
+ }
+ code_block_infos.erase(iter);
+}
+
+void SigHandler::SigAction(int sig, siginfo_t* info, void* raw_context) {
+ ASSERT(sig == SIGSEGV || sig == SIGBUS);
+
+ ucontext_t* ucontext = reinterpret_cast<ucontext_t*>(raw_context);
+#ifndef __OpenBSD__
+ auto& mctx = ucontext->uc_mcontext;
+#endif
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+
+# if defined(__APPLE__)
+# define CTX_RIP (mctx->__ss.__rip)
+# define CTX_RSP (mctx->__ss.__rsp)
+# elif defined(__linux__)
+# define CTX_RIP (mctx.gregs[REG_RIP])
+# define CTX_RSP (mctx.gregs[REG_RSP])
+# elif defined(__FreeBSD__)
+# define CTX_RIP (mctx.mc_rip)
+# define CTX_RSP (mctx.mc_rsp)
+# elif defined(__NetBSD__)
+# define CTX_RIP (mctx.__gregs[_REG_RIP])
+# define CTX_RSP (mctx.__gregs[_REG_RSP])
+# elif defined(__OpenBSD__)
+# define CTX_RIP (ucontext->sc_rip)
+# define CTX_RSP (ucontext->sc_rsp)
+# else
+# error "Unknown platform"
+# endif
+
+ {
+ std::lock_guard<std::mutex> guard(sig_handler->code_block_infos_mutex);
+
+ const auto iter = sig_handler->FindCodeBlockInfo(CTX_RIP);
+ if (iter != sig_handler->code_block_infos.end()) {
+ FakeCall fc = iter->cb(CTX_RIP);
+
+ CTX_RSP -= sizeof(u64);
+ *mcl::bit_cast<u64*>(CTX_RSP) = fc.ret_rip;
+ CTX_RIP = fc.call_rip;
+
+ return;
+ }
+ }
+
+ fmt::print(stderr, "Unhandled {} at rip {:#018x}\n", sig == SIGSEGV ? "SIGSEGV" : "SIGBUS", CTX_RIP);
+
+#elif defined(MCL_ARCHITECTURE_ARM64)
+
+# if defined(__APPLE__)
+# define CTX_PC (mctx->__ss.__pc)
+# define CTX_SP (mctx->__ss.__sp)
+# define CTX_LR (mctx->__ss.__lr)
+# define CTX_X(i) (mctx->__ss.__x[i])
+# define CTX_Q(i) (mctx->__ns.__v[i])
+# elif defined(__linux__)
+# define CTX_PC (mctx.pc)
+# define CTX_SP (mctx.sp)
+# define CTX_LR (mctx.regs[30])
+# define CTX_X(i) (mctx.regs[i])
+# define CTX_Q(i) (fpctx->vregs[i])
+ [[maybe_unused]] const auto fpctx = [&mctx] {
+ _aarch64_ctx* header = (_aarch64_ctx*)&mctx.__reserved;
+ while (header->magic != FPSIMD_MAGIC) {
+ ASSERT(header->magic && header->size);
+ header = (_aarch64_ctx*)((char*)header + header->size);
+ }
+ return (fpsimd_context*)header;
+ }();
+# elif defined(__FreeBSD__)
+# define CTX_PC (mctx.mc_gpregs.gp_elr)
+# define CTX_SP (mctx.mc_gpregs.gp_sp)
+# define CTX_LR (mctx.mc_gpregs.gp_lr)
+# define CTX_X(i) (mctx.mc_gpregs.gp_x[i])
+# define CTX_Q(i) (mctx.mc_fpregs.fp_q[i])
+# elif defined(__NetBSD__)
+# define CTX_PC (mctx.mc_gpregs.gp_elr)
+# define CTX_SP (mctx.mc_gpregs.gp_sp)
+# define CTX_LR (mctx.mc_gpregs.gp_lr)
+# define CTX_X(i) (mctx.mc_gpregs.gp_x[i])
+# define CTX_Q(i) (mctx.mc_fpregs.fp_q[i])
+# elif defined(__OpenBSD__)
+# define CTX_PC (ucontext->sc_elr)
+# define CTX_SP (ucontext->sc_sp)
+# define CTX_LR (ucontext->sc_lr)
+# define CTX_X(i) (ucontext->sc_x[i])
+# define CTX_Q(i) (ucontext->sc_q[i])
+# else
+# error "Unknown platform"
+# endif
+
+ {
+ std::lock_guard<std::mutex> guard(sig_handler->code_block_infos_mutex);
+
+ const auto iter = sig_handler->FindCodeBlockInfo(CTX_PC);
+ if (iter != sig_handler->code_block_infos.end()) {
+ FakeCall fc = iter->cb(CTX_PC);
+
+ CTX_PC = fc.call_pc;
+
+ return;
+ }
+ }
+
+ fmt::print(stderr, "Unhandled {} at pc {:#018x}\n", sig == SIGSEGV ? "SIGSEGV" : "SIGBUS", CTX_PC);
+
+#else
+
+# error "Invalid architecture"
+
+#endif
+
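+ // Not a fault we handle: forward the signal to the previously installed handler, if any.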
+ struct sigaction* retry_sa = sig == SIGSEGV ? &sig_handler->old_sa_segv : &sig_handler->old_sa_bus;
+ if (retry_sa->sa_flags & SA_SIGINFO) {
+ retry_sa->sa_sigaction(sig, info, raw_context);
+ return;
+ }
+ if (retry_sa->sa_handler == SIG_DFL) {
+ signal(sig, SIG_DFL);
+ return;
+ }
+ if (retry_sa->sa_handler == SIG_IGN) {
+ return;
+ }
+ retry_sa->sa_handler(sig);
+}
+
+} // anonymous namespace
+
+struct ExceptionHandler::Impl final {
+ Impl(u64 code_begin_, u64 code_end_)
+ : code_begin(code_begin_)
+ , code_end(code_end_) {
+ RegisterHandler();
+ }
+
+ void SetCallback(std::function<FakeCall(u64)> cb) {
+ CodeBlockInfo cbi;
+ cbi.code_begin = code_begin;
+ cbi.code_end = code_end;
+ cbi.cb = cb;
+ sig_handler->AddCodeBlock(cbi);
+ }
+
+ ~Impl() {
+ sig_handler->RemoveCodeBlock(code_begin);
+ }
+
+private:
+ u64 code_begin, code_end;
+};
+
+ExceptionHandler::ExceptionHandler() = default;
+ExceptionHandler::~ExceptionHandler() = default;
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+void ExceptionHandler::Register(X64::BlockOfCode& code) {
+ const u64 code_begin = mcl::bit_cast<u64>(code.getCode());
+ const u64 code_end = code_begin + code.GetTotalCodeSize();
+ impl = std::make_unique<Impl>(code_begin, code_end);
+}
+#elif defined(MCL_ARCHITECTURE_ARM64)
+void ExceptionHandler::Register(oaknut::CodeBlock& mem, std::size_t size) {
+ const u64 code_begin = mcl::bit_cast<u64>(mem.ptr());
+ const u64 code_end = code_begin + size;
+ impl = std::make_unique<Impl>(code_begin, code_end);
+}
+#else
+# error "Invalid architecture"
+#endif
+
+bool ExceptionHandler::SupportsFastmem() const noexcept {
+ return static_cast<bool>(impl) && sig_handler->SupportsFastmem();
+}
+
+void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) {
+ impl->SetCallback(cb);
+}
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp
new file mode 100644
index 0000000000..719c007594
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp
@@ -0,0 +1,14 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/macro/architecture.hpp>
+
+#if defined(MCL_ARCHITECTURE_X86_64)
+# include "dynarmic/backend/x64/exception_handler_windows.cpp"
+#elif defined(MCL_ARCHITECTURE_ARM64)
+# include "dynarmic/backend/exception_handler_generic.cpp"
+#else
+# error "Invalid architecture"
+#endif
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
new file mode 100644
index 0000000000..fc8e8bae21
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
@@ -0,0 +1,1299 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/a32_emit_x64.h"
+
+#include <algorithm>
+#include <optional>
+#include <utility>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/a32_jitstate.h"
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/backend/x64/nzcv_util.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+#include "dynarmic/common/variant_util.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/interface/A32/coprocessor.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary.
+// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart.
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+static Xbyak::Address MJitStateReg(A32::Reg reg) {
+ return dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * static_cast<size_t>(reg)];
+}
+
+static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) {
+ if (A32::IsSingleExtReg(reg)) {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0);
+ return dword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u32) * index];
+ }
+ if (A32::IsDoubleExtReg(reg)) {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0);
+ return qword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u64) * index];
+ }
+ if (A32::IsQuadExtReg(reg)) {
+ const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0);
+ return xword[r15 + offsetof(A32JitState, ExtReg) + 2 * sizeof(u64) * index];
+ }
+ ASSERT_FALSE("Should never happen.");
+}
+
+A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
+ : EmitContext(reg_alloc, block), conf(conf) {}
+
+A32::LocationDescriptor A32EmitContext::Location() const {
+ return A32::LocationDescriptor{block.Location()};
+}
+
+A32::LocationDescriptor A32EmitContext::EndLocation() const {
+ return A32::LocationDescriptor{block.EndLocation()};
+}
+
+bool A32EmitContext::IsSingleStep() const {
+ return Location().SingleStepping();
+}
+
+FP::FPCR A32EmitContext::FPCR(bool fpcr_controlled) const {
+ const FP::FPCR fpcr = FP::FPCR{Location().FPSCR().Value()};
+ return fpcr_controlled ? fpcr : fpcr.ASIMDStandardValue();
+}
+
+A32EmitX64::A32EmitX64(BlockOfCode& code, A32::UserConfig conf, A32::Jit* jit_interface)
+ : EmitX64(code), conf(std::move(conf)), jit_interface(jit_interface) {
+ GenFastmemFallbacks();
+ GenTerminalHandlers();
+ code.PreludeComplete();
+ ClearFastDispatchTable();
+
+ exception_handler.SetFastmemCallback([this](u64 rip_) {
+ return FastmemCallback(rip_);
+ });
+}
+
+A32EmitX64::~A32EmitX64() = default;
+
+A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
+ if (conf.very_verbose_debugging_output) {
+ std::puts(IR::DumpBlock(block).c_str());
+ }
+
+ code.EnableWriting();
+ SCOPE_EXIT {
+ code.DisableWriting();
+ };
+
+ const std::vector<HostLoc> gpr_order = [this] {
+ std::vector<HostLoc> gprs{any_gpr};
+ if (conf.page_table) {
+ gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
+ }
+ if (conf.fastmem_pointer) {
+ gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
+ }
+ return gprs;
+ }();
+
+ RegAlloc reg_alloc{code, gpr_order, any_xmm};
+ A32EmitContext ctx{conf, reg_alloc, block};
+
+ // Start emitting.
+ code.align();
+ const u8* const entrypoint = code.getCurr();
+
+ EmitCondPrelude(ctx);
+
+ for (auto iter = block.begin(); iter != block.end(); ++iter) {
+ IR::Inst* inst = &*iter;
+
+ // Call the relevant Emit* member function.
+ switch (inst->GetOpcode()) {
+#define OPCODE(name, type, ...) \
+ case IR::Opcode::name: \
+ A32EmitX64::Emit##name(ctx, inst); \
+ break;
+#define A32OPC(name, type, ...) \
+ case IR::Opcode::A32##name: \
+ A32EmitX64::EmitA32##name(ctx, inst); \
+ break;
+#define A64OPC(...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+
+ default:
+ ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode());
+ break;
+ }
+
+ reg_alloc.EndOfAllocScope();
+
+ if (conf.very_verbose_debugging_output) {
+ EmitVerboseDebuggingOutput(reg_alloc);
+ }
+ }
+
+ reg_alloc.AssertNoMoreUses();
+
+ if (conf.enable_cycle_counting) {
+ EmitAddCycles(block.CycleCount());
+ }
+ EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
+ code.int3();
+
+ for (auto& deferred_emit : ctx.deferred_emits) {
+ deferred_emit();
+ }
+ code.int3();
+
+ const size_t size = static_cast<size_t>(code.getCurr() - entrypoint);
+
+ const A32::LocationDescriptor descriptor{block.Location()};
+ const A32::LocationDescriptor end_location{block.EndLocation()};
+
+ const auto range = boost::icl::discrete_interval<u32>::closed(descriptor.PC(), end_location.PC() - 1);
+ block_ranges.AddRange(range, descriptor);
+
+ return RegisterBlock(descriptor, entrypoint, size);
+}
+
+void A32EmitX64::ClearCache() {
+ EmitX64::ClearCache();
+ block_ranges.ClearCache();
+ ClearFastDispatchTable();
+ fastmem_patch_info.clear();
+}
+
+void A32EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges) {
+ InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
+}
+
+void A32EmitX64::EmitCondPrelude(const A32EmitContext& ctx) {
+ if (ctx.block.GetCondition() == IR::Cond::AL) {
+ ASSERT(!ctx.block.HasConditionFailedLocation());
+ return;
+ }
+
+ ASSERT(ctx.block.HasConditionFailedLocation());
+
+ Xbyak::Label pass = EmitCond(ctx.block.GetCondition());
+ if (conf.enable_cycle_counting) {
+ EmitAddCycles(ctx.block.ConditionFailedCycleCount());
+ }
+ EmitTerminal(IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
+ code.L(pass);
+}
+
+void A32EmitX64::ClearFastDispatchTable() {
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ fast_dispatch_table.fill({});
+ }
+}
+
+void A32EmitX64::GenTerminalHandlers() {
+ // PC ends up in ebp, location_descriptor ends up in rbx
+ const auto calculate_location_descriptor = [this] {
+ // This calculation has to match up with IREmitter::PushRSB
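+ // i.e. rbx = (upper_location_descriptor << 32) | PC, with the PC also left in ebp.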
+ code.mov(ebx, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
+ code.shl(rbx, 32);
+ code.mov(ecx, MJitStateReg(A32::Reg::PC));
+ code.mov(ebp, ecx);
+ code.or_(rbx, rcx);
+ };
+
+ Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
+
+ code.align();
+ terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
+ calculate_location_descriptor();
+ code.mov(eax, dword[r15 + offsetof(A32JitState, rsb_ptr)]);
+ code.sub(eax, 1);
+ code.and_(eax, u32(A32JitState::RSBPtrMask));
+ code.mov(dword[r15 + offsetof(A32JitState, rsb_ptr)], eax);
+ code.cmp(rbx, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.jne(rsb_cache_miss);
+ } else {
+ code.jne(code.GetReturnFromRunCodeAddress());
+ }
+ code.mov(rax, qword[r15 + offsetof(A32JitState, rsb_codeptrs) + rax * sizeof(u64)]);
+ code.jmp(rax);
+ PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a32_terminal_handler_pop_rsb_hint");
+
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.align();
+ terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
+ calculate_location_descriptor();
+ code.L(rsb_cache_miss);
+ code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
+ code.mov(rbp, rbx);
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ code.crc32(rbp, r12);
+ }
+ code.and_(ebp, fast_dispatch_table_mask);
+ code.lea(rbp, ptr[r12 + rbp]);
+ code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
+ code.jne(fast_dispatch_cache_miss);
+ code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
+ code.L(fast_dispatch_cache_miss);
+ code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
+ code.LookupBlock();
+ code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
+ code.jmp(rax);
+ PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint");
+
+ code.align();
+ fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry& (*)(u64)>();
+ code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
+ }
+ code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
+ code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM1 + code.ABI_PARAM2]);
+ code.ret();
+ PerfMapRegister(fast_dispatch_table_lookup, code.getCurr(), "a32_fast_dispatch_table_lookup");
+ }
+}
+
+void A32EmitX64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
+ code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store);
+}
+
+void A32EmitX64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) {
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(result, MJitStateReg(reg));
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32GetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsSingleExtReg(reg));
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movss(result, MJitStateExtReg(reg));
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32GetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg));
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movsd(result, MJitStateExtReg(reg));
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32GetVector(A32EmitContext& ctx, IR::Inst* inst) {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ if (A32::IsDoubleExtReg(reg)) {
+ code.movsd(result, MJitStateExtReg(reg));
+ } else {
+ code.movaps(result, MJitStateExtReg(reg));
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+
+ if (args[1].IsImmediate()) {
+ code.mov(MJitStateReg(reg), args[1].GetImmediateU32());
+ } else if (args[1].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ code.movd(MJitStateReg(reg), to_store);
+ } else {
+ const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ code.mov(MJitStateReg(reg), to_store);
+ }
+}
+
+void A32EmitX64::EmitA32SetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsSingleExtReg(reg));
+
+ if (args[1].IsInXmm()) {
+ Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ code.movss(MJitStateExtReg(reg), to_store);
+ } else {
+ Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ code.mov(MJitStateExtReg(reg), to_store);
+ }
+}
+
+void A32EmitX64::EmitA32SetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg));
+
+ if (args[1].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ code.movsd(MJitStateExtReg(reg), to_store);
+ } else {
+ const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]);
+ code.mov(MJitStateExtReg(reg), to_store);
+ }
+}
+
+void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ if (A32::IsDoubleExtReg(reg)) {
+ code.movsd(MJitStateExtReg(reg), to_store);
+ } else {
+ code.movaps(MJitStateExtReg(reg), to_store);
+ }
+}
+
+void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
+ // so we load them both at the same time with one 64-bit read. This allows us to
+ // extract all of their bits together at once with one pext.
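+ // pext packs the six masked bits (bits 0-1 of upper_location_descriptor plus the sign bit of
+ // each cpsr_ge byte) into the low bits of result; pdep then scatters them to CPSR bits 5, 9
+ // and 16-19 (mask 0x000f0220).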
+ static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
+ code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
+ code.mov(tmp.cvt64(), 0x80808080'00000003ull);
+ code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
+ code.mov(tmp, 0x000f0220);
+ code.pdep(result, result, tmp);
+ } else {
+ code.mov(result, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
+ code.imul(result, result, 0x120);
+ code.and_(result, 0x00000220);
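+ // The multiply by 0x120 copies bit 0 into bit 5 and bit 1 into bit 9 (among other positions);
+ // the 0x220 mask keeps only those two CPSR bits.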
+
+ code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
+ code.and_(tmp, 0x80808080);
+ code.imul(tmp, tmp, 0x00204081);
+ code.shr(tmp, 12);
+ code.and_(tmp, 0x000f0000);
+ code.or_(result, tmp);
+ }
+
+ code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
+ code.shl(tmp, 27);
+ code.or_(result, tmp);
+
+ code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ code.mov(tmp, NZCV::x64_mask);
+ code.pext(tmp2, tmp2, tmp);
+ code.shl(tmp2, 28);
+ } else {
+ code.and_(tmp2, NZCV::x64_mask);
+ code.imul(tmp2, tmp2, NZCV::from_x64_multiplier);
+ code.and_(tmp2, NZCV::arm_mask);
+ }
+ code.or_(result, tmp2);
+
+ code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ if (conf.always_little_endian) {
+ code.and_(cpsr, 0xFFFFFDFF);
+ }
+
+ // cpsr_q
+ code.bt(cpsr, 27);
+ code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
+
+ // cpsr_nzcv
+ code.mov(tmp, cpsr);
+ code.shr(tmp, 28);
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ code.mov(tmp2, NZCV::x64_mask);
+ code.pdep(tmp, tmp, tmp2);
+ } else {
+ code.imul(tmp, tmp, NZCV::to_x64_multiplier);
+ code.and_(tmp, NZCV::x64_mask);
+ }
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);
+
+ // cpsr_jaifm
+ code.mov(tmp, cpsr);
+ code.and_(tmp, 0x010001DF);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
+
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ // cpsr_et and cpsr_ge
+ static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
+ // This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword.
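+ // (The 32-bit immediate is sign-extended for the 64-bit AND; keeping the MSB clear zeroes the
+ // upper dword of the mask, so cpsr_ge is cleared here and rewritten by the OR below.)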
+ static_assert((A32::LocationDescriptor::FPSCR_MODE_MASK & ~0x7FFF0000) == 0);
+
+ code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000));
+ code.mov(tmp, 0x000f0220);
+ code.pext(cpsr, cpsr, tmp);
+ code.mov(tmp.cvt64(), 0x01010101'00000003ull);
+ code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64());
+ // We perform SWAR partitioned subtraction here, to negate the GE bytes.
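+ // Per GE byte, (0x80 - b) ^ 0x80 maps b = 1 to 0xFF and b = 0 to 0x00, while (0x3 - x) ^ 0x3
+ // leaves the two-bit E/T field unchanged; no borrow crosses a byte boundary.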
+ code.mov(tmp.cvt64(), 0x80808080'00000003ull);
+ code.mov(tmp2.cvt64(), tmp.cvt64());
+ code.sub(tmp.cvt64(), cpsr.cvt64());
+ code.xor_(tmp.cvt64(), tmp2.cvt64());
+ code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
+ } else {
+ code.and_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
+ code.mov(tmp, cpsr);
+ code.and_(tmp, 0x00000220);
+ code.imul(tmp, tmp, 0x00900000);
+ code.shr(tmp, 28);
+ code.or_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp);
+
+ code.and_(cpsr, 0x000f0000);
+ code.shr(cpsr, 16);
+ code.imul(cpsr, cpsr, 0x00204081);
+ code.and_(cpsr, 0x01010101);
+ code.mov(tmp, 0x80808080);
+ code.sub(tmp, cpsr);
+ code.xor_(tmp, 0x80808080);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
+ }
+}
+
+void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store);
+}
+
+void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (args[0].IsImmediate()) {
+ const u32 imm = args[0].GetImmediateU32();
+
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
+ } else if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.shr(a, 28);
+ code.mov(b, NZCV::x64_mask);
+ code.pdep(a, a, b);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
+ } else {
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ code.shr(a, 28);
+ code.imul(a, a, NZCV::to_x64_multiplier);
+ code.and_(a, NZCV::x64_mask);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
+ }
+}
+
+void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (args[0].IsImmediate()) {
+ const u32 imm = args[0].GetImmediateU32();
+
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
+ } else if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.shr(a, 28);
+ code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
+ code.mov(b, NZCV::x64_mask);
+ code.pdep(a, a, b);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
+ } else {
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ code.shr(a, 28);
+ code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
+ code.imul(a, a, NZCV::to_x64_multiplier);
+ code.and_(a, NZCV::x64_mask);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a);
+ }
+}
+
+void A32EmitX64::EmitA32SetCpsrNZ(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 nz = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1]);
+ code.and_(tmp, 1);
+ code.or_(tmp, nz);
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], tmp.cvt8());
+}
+
+void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsImmediate()) {
+ if (args[1].IsImmediate()) {
+ const bool c = args[1].GetImmediateU1();
+
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c);
+ } else {
+ const Xbyak::Reg8 c = ctx.reg_alloc.UseGpr(args[1]).cvt8();
+
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c);
+ }
+ } else {
+ const Xbyak::Reg32 nz = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ if (args[1].IsImmediate()) {
+ const bool c = args[1].GetImmediateU1();
+
+ code.or_(nz, c);
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
+ } else {
+ const Xbyak::Reg32 c = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+
+ code.or_(nz, c);
+ code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8());
+ }
+ }
+}
+
+static void EmitGetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+ if (flag_bit != 0) {
+ code.shr(result, static_cast<int>(flag_bit));
+ }
+ code.and_(result, 1);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitGetFlag(code, ctx, inst, NZCV::x64_c_flag_bit);
+}
+
+void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (args[0].IsImmediate()) {
+ if (args[0].GetImmediateU1()) {
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_q)], 1);
+ }
+ } else {
+ const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
+
+ code.or_(code.byte[r15 + offsetof(A32JitState, cpsr_q)], to_store);
+ }
+}
+
+void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movd(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(!args[0].IsImmediate());
+
+ if (args[0].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
+ code.movd(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store);
+ } else {
+ const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store);
+ }
+}
+
+void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (args[0].IsImmediate()) {
+ const u32 imm = args[0].GetImmediateU32();
+ u32 ge = 0;
+ ge |= mcl::bit::get_bit<19>(imm) ? 0xFF000000 : 0;
+ ge |= mcl::bit::get_bit<18>(imm) ? 0x00FF0000 : 0;
+ ge |= mcl::bit::get_bit<17>(imm) ? 0x0000FF00 : 0;
+ ge |= mcl::bit::get_bit<16>(imm) ? 0x000000FF : 0;
+
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
+ } else if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(b, 0x01010101);
+ code.shr(a, 16);
+ code.pdep(a, a, b);
+ code.imul(a, a, 0xFF);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a);
+ } else {
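+ // 0x00204081 has bits 0, 7, 14 and 21 set, so multiplying the 4-bit GE field by it places
+ // GE[0..3] at bit positions 0, 8, 16 and 24; masking with 0x01010101 and multiplying by 0xFF
+ // then widens each GE bit into a full 0x00/0xFF byte.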
+ const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ code.shr(a, 16);
+ code.and_(a, 0xF);
+ code.imul(a, a, 0x00204081);
+ code.and_(a, 0x01010101);
+ code.imul(a, a, 0xFF);
+ code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a);
+ }
+}
+
+void A32EmitX64::EmitA32DataSynchronizationBarrier(A32EmitContext&, IR::Inst*) {
+ code.mfence();
+ code.lfence();
+}
+
+void A32EmitX64::EmitA32DataMemoryBarrier(A32EmitContext&, IR::Inst*) {
+ code.mfence();
+}
+
+void A32EmitX64::EmitA32InstructionSynchronizationBarrier(A32EmitContext& ctx, IR::Inst*) {
+ if (!conf.hook_isb) {
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(nullptr);
+ Devirtualize<&A32::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code);
+}
+
+void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& arg = args[0];
+
+ const u32 upper_without_t = (ctx.EndLocation().SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE;
+
+ // Pseudocode:
+ // if (new_pc & 1) {
+ // new_pc &= 0xFFFFFFFE;
+ // cpsr.T = true;
+ // } else {
+ // new_pc &= 0xFFFFFFFC;
+ // cpsr.T = false;
+ // }
+ // We rely on the fact that we disallow the E flag from changing within a block.
+
+ if (arg.IsImmediate()) {
+ const u32 new_pc = arg.GetImmediateU32();
+ const u32 mask = mcl::bit::get_bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
+ const u32 new_upper = upper_without_t | (mcl::bit::get_bit<0>(new_pc) ? 1 : 0);
+
+ code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
+ code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
+ } else {
+ const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
+ const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 new_upper = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(mask, new_pc);
+ code.and_(mask, 1);
+ code.lea(new_upper, ptr[mask.cvt64() + upper_without_t]);
+ code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
+ code.and_(new_pc, mask);
+ code.mov(MJitStateReg(A32::Reg::PC), new_pc);
+ code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
+ }
+}
+
+void A32EmitX64::EmitA32UpdateUpperLocationDescriptor(A32EmitContext& ctx, IR::Inst*) {
+ for (auto& inst : ctx.block) {
+ if (inst.GetOpcode() == IR::Opcode::A32BXWritePC) {
+ return;
+ }
+ }
+ EmitSetUpperLocationDescriptor(ctx.EndLocation(), ctx.Location());
+}
+
+void A32EmitX64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) {
+ code.SwitchMxcsrOnExit();
+
+ if (conf.enable_cycle_counting) {
+ ctx.reg_alloc.HostCall(nullptr);
+ code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
+ code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
+ Devirtualize<&A32::UserCallbacks::AddTicks>(conf.callbacks).EmitCall(code);
+ ctx.reg_alloc.EndOfAllocScope();
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
+ Devirtualize<&A32::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code);
+
+ if (conf.enable_cycle_counting) {
+ Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(conf.callbacks).EmitCall(code);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], code.ABI_RETURN);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], code.ABI_RETURN);
+ code.SwitchMxcsrOnEntry();
+ }
+}
+
+void A32EmitX64::EmitA32ExceptionRaised(A32EmitContext& ctx, IR::Inst* inst) {
+ code.SwitchMxcsrOnExit();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ if (conf.enable_cycle_counting) {
+ code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
+ code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
+ Devirtualize<&A32::UserCallbacks::AddTicks>(conf.callbacks).EmitCall(code);
+ }
+ ctx.reg_alloc.EndOfAllocScope();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[0].IsImmediate() && args[1].IsImmediate());
+ const u32 pc = args[0].GetImmediateU32();
+ const u64 exception = args[1].GetImmediateU64();
+ Devirtualize<&A32::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code, [&](RegList param) {
+ code.mov(param[0], pc);
+ code.mov(param[1], exception);
+ });
+
+ if (conf.enable_cycle_counting) {
+ Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(conf.callbacks).EmitCall(code);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], code.ABI_RETURN);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], code.ABI_RETURN);
+ code.SwitchMxcsrOnEntry();
+ }
+}
+
+static u32 GetFpscrImpl(A32JitState* jit_state) {
+ return jit_state->Fpscr();
+}
+
+void A32EmitX64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.HostCall(inst);
+ code.mov(code.ABI_PARAM1, code.r15);
+
+ code.stmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]);
+ code.CallFunction(&GetFpscrImpl);
+}
+
+static void SetFpscrImpl(u32 value, A32JitState* jit_state) {
+ jit_state->SetFpscr(value);
+}
+
+void A32EmitX64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, args[0]);
+ code.mov(code.ABI_PARAM2, code.r15);
+
+ code.CallFunction(&SetFpscrImpl);
+ code.ldmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]);
+}
+
+void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, dword[r15 + offsetof(A32JitState, fpsr_nzcv)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(tmp, NZCV::x64_mask);
+ code.pext(tmp, value, tmp);
+ code.shl(tmp, 28);
+ code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], tmp);
+
+ return;
+ }
+
+ const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ code.and_(value, NZCV::x64_mask);
+ code.imul(value, value, NZCV::from_x64_multiplier);
+ code.and_(value, NZCV::arm_mask);
+ code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value);
+}
+
+static void EmitCoprocessorException() {
+ ASSERT_FALSE("Should raise coproc exception here");
+}
+
+static void CallCoprocCallback(BlockOfCode& code, RegAlloc& reg_alloc, A32::Coprocessor::Callback callback, IR::Inst* inst = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}) {
+ reg_alloc.HostCall(inst, {}, arg0, arg1);
+
+ if (callback.user_arg) {
+ code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(*callback.user_arg));
+ }
+
+ code.CallFunction(callback.function);
+}
+
+void A32EmitX64::EmitA32CoprocInternalOperation(A32EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[5]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[6]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx.reg_alloc, *action);
+}
+
+void A32EmitX64::EmitA32CoprocSendOneWord(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[5]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx.reg_alloc, *cb, nullptr, args[1]);
+ return;
+ }
+
+ if (const auto destination_ptr = std::get_if<u32*>(&action)) {
+ const Xbyak::Reg32 reg_word = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(reg_destination_addr, reinterpret_cast<u64>(*destination_ptr));
+ code.mov(code.dword[reg_destination_addr], reg_word);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+void A32EmitX64::EmitA32CoprocSendTwoWords(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc = static_cast<unsigned>(coproc_info[2]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileSendTwoWords(two, opc, CRm);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx.reg_alloc, *cb, nullptr, args[1], args[2]);
+ return;
+ }
+
+ if (const auto destination_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
+ const Xbyak::Reg32 reg_word1 = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg32 reg_word2 = ctx.reg_alloc.UseGpr(args[2]).cvt32();
+ const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(reg_destination_addr, reinterpret_cast<u64>((*destination_ptrs)[0]));
+ code.mov(code.dword[reg_destination_addr], reg_word1);
+ code.mov(reg_destination_addr, reinterpret_cast<u64>((*destination_ptrs)[1]));
+ code.mov(code.dword[reg_destination_addr], reg_word2);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+void A32EmitX64::EmitA32CoprocGetOneWord(A32EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const auto opc1 = static_cast<unsigned>(coproc_info[2]);
+ const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]);
+ const auto opc2 = static_cast<unsigned>(coproc_info[5]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx.reg_alloc, *cb, inst);
+ return;
+ }
+
+ if (const auto source_ptr = std::get_if<u32*>(&action)) {
+ const Xbyak::Reg32 reg_word = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg64 reg_source_addr = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(reg_source_addr, reinterpret_cast<u64>(*source_ptr));
+ code.mov(reg_word, code.dword[reg_source_addr]);
+
+ ctx.reg_alloc.DefineValue(inst, reg_word);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
+
+void A32EmitX64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) {
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const unsigned opc = coproc_info[2];
+ const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]);
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ auto action = coproc->CompileGetTwoWords(two, opc, CRm);
+
+ if (std::holds_alternative<std::monostate>(action)) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) {
+ CallCoprocCallback(code, ctx.reg_alloc, *cb, inst);
+ return;
+ }
+
+ if (const auto source_ptrs = std::get_if<std::array<u32*, 2>>(&action)) {
+ const Xbyak::Reg64 reg_result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 reg_tmp = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(reg_destination_addr, reinterpret_cast<u64>((*source_ptrs)[1]));
+ code.mov(reg_result.cvt32(), code.dword[reg_destination_addr]);
+ code.shl(reg_result, 32);
+ code.mov(reg_destination_addr, reinterpret_cast<u64>((*source_ptrs)[0]));
+ code.mov(reg_tmp.cvt32(), code.dword[reg_destination_addr]);
+ code.or_(reg_result, reg_tmp);
+
+ ctx.reg_alloc.DefineValue(inst, reg_result);
+
+ return;
+ }
+
+ UNREACHABLE();
+}
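For reference, the std::array<u32*, 2> branch above is the JIT-ed form of a simple host-side pair read: the second pointer supplies the high word and the first the low word of the 64-bit (MRRC-style) result. A minimal sketch, with a hypothetical helper name:

    #include <array>
    #include <cstdint>

    // Host-side equivalent of the two-pointer case in EmitA32CoprocGetTwoWords:
    // source_ptrs[1] is the high word, source_ptrs[0] the low word.
    std::uint64_t ReadCoprocPair(const std::array<std::uint32_t*, 2>& source_ptrs) {
        return (std::uint64_t{*source_ptrs[1]} << 32) | std::uint64_t{*source_ptrs[0]};
    }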
+
+void A32EmitX64::EmitA32CoprocLoadWords(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const bool long_transfer = coproc_info[2] != 0;
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const bool has_option = coproc_info[4] != 0;
+
+ std::optional<u8> option = std::nullopt;
+ if (has_option) {
+ option = coproc_info[5];
+ }
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx.reg_alloc, *action, nullptr, args[1]);
+}
+
+void A32EmitX64::EmitA32CoprocStoreWords(A32EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const auto coproc_info = inst->GetArg(0).GetCoprocInfo();
+ const size_t coproc_num = coproc_info[0];
+ const bool two = coproc_info[1] != 0;
+ const bool long_transfer = coproc_info[2] != 0;
+ const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]);
+ const bool has_option = coproc_info[4] != 0;
+
+ std::optional<u8> option = std::nullopt;
+ if (has_option) {
+ option = coproc_info[5];
+ }
+
+ std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num];
+ if (!coproc) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ const auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option);
+ if (!action) {
+ EmitCoprocessorException();
+ return;
+ }
+
+ CallCoprocCallback(code, ctx.reg_alloc, *action, nullptr, args[1]);
+}
+
+std::string A32EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const {
+ const A32::LocationDescriptor descriptor{ir_descriptor};
+ return fmt::format("a32_{}{:08X}_{}_fpcr{:08X}",
+ descriptor.TFlag() ? "t" : "a",
+ descriptor.PC(),
+ descriptor.EFlag() ? "be" : "le",
+ descriptor.FPSCR().Value());
+}
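An example of the naming scheme produced above, with purely illustrative values (the helper wrapper is hypothetical): a Thumb ("t"), little-endian block at PC 0x00001000 whose FPSCR mode bits are 0x03C00000 gets the friendly name shown in the comment.

    #include <string>

    #include <fmt/format.h>

    std::string ExampleFriendlyName() {
        return fmt::format("a32_{}{:08X}_{}_fpcr{:08X}", "t", 0x1000u, "le", 0x03C00000u);
        // == "a32_t00001000_le_fpcr03C00000"
    }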
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool) {
+ ASSERT_MSG(A32::LocationDescriptor{terminal.next}.TFlag() == A32::LocationDescriptor{initial_location}.TFlag(), "Unimplemented");
+ ASSERT_MSG(A32::LocationDescriptor{terminal.next}.EFlag() == A32::LocationDescriptor{initial_location}.EFlag(), "Unimplemented");
+ ASSERT_MSG(terminal.num_instructions == 1, "Unimplemented");
+
+ code.mov(code.ABI_PARAM2.cvt32(), A32::LocationDescriptor{terminal.next}.PC());
+ code.mov(code.ABI_PARAM3.cvt32(), 1);
+ code.mov(MJitStateReg(A32::Reg::PC), code.ABI_PARAM2.cvt32());
+ code.SwitchMxcsrOnExit();
+ Devirtualize<&A32::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code);
+ code.ReturnFromRunCode(true); // TODO: Check cycles
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
+ code.ReturnFromRunCode();
+}
+
+void A32EmitX64::EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) {
+ auto get_upper = [](const IR::LocationDescriptor& desc) -> u32 {
+ return static_cast<u32>(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32);
+ };
+
+ const u32 old_upper = get_upper(old_location);
+ const u32 new_upper = [&] {
+        const u32 mask = ~u32(conf.always_little_endian ? 0x2 : 0);  // bit 1 of the upper descriptor is the E (big-endian) flag; keep it cleared when data accesses are always little-endian
+ return get_upper(new_location) & mask;
+ }();
+
+ if (old_upper != new_upper) {
+ code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper);
+ }
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ EmitSetUpperLocationDescriptor(terminal.next, initial_location);
+
+ if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ if (conf.enable_cycle_counting) {
+ code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
+
+ patch_information[terminal.next].jg.push_back(code.getCurr());
+ if (const auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJg(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJg(terminal.next);
+ }
+ } else {
+ code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0);
+
+ patch_information[terminal.next].jz.push_back(code.getCurr());
+ if (const auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJz(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJz(terminal.next);
+ }
+ }
+
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
+ PushRSBHelper(rax, rbx, terminal.next);
+ code.ForceReturnFromRunCode();
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ EmitSetUpperLocationDescriptor(terminal.next, initial_location);
+
+ if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC());
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ patch_information[terminal.next].jmp.push_back(code.getCurr());
+ if (const auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJmp(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJmp(terminal.next);
+ }
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) {
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ code.jmp(terminal_handler_pop_rsb_hint);
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) {
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ code.jmp(terminal_handler_fast_dispatch_hint);
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ Xbyak::Label pass = EmitCond(terminal.if_);
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+ code.L(pass);
+ EmitTerminal(terminal.then_, initial_location, is_single_step);
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ Xbyak::Label fail;
+ code.cmp(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], u8(0));
+ code.jz(fail);
+ EmitTerminal(terminal.then_, initial_location, is_single_step);
+ code.L(fail);
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+}
+
+void A32EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0);
+ code.jne(code.GetForceReturnFromRunCodeAddress());
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+}
+
+void A32EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jg(target_code_ptr);
+ } else {
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC());
+ code.jg(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 14);
+}
+
+void A32EmitX64::EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jz(target_code_ptr);
+ } else {
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC());
+ code.jz(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 14);
+}
+
+void A32EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jmp(target_code_ptr);
+ } else {
+ code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC());
+ code.jmp(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 13);
+}
+
+void A32EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) {
+ if (!target_code_ptr) {
+ target_code_ptr = code.GetReturnFromRunCodeAddress();
+ }
+ const CodePtr patch_location = code.getCurr();
+ code.mov(code.rcx, reinterpret_cast<u64>(target_code_ptr));
+ code.EnsurePatchLocationSize(patch_location, 10);
+}
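The fixed sizes passed to EnsurePatchLocationSize() reserve a worst-case footprint so the sequence can later be re-patched in place. A back-of-the-envelope size check under common x86-64 encoding assumptions (rel32 branches, a disp8 [r15+offset] store for the PC write; these encodings are assumptions, not taken from this codebase):

    constexpr int mov_store_pc_bytes  = 8;   // mov dword [r15+disp8], imm32
    constexpr int jcc_rel32_bytes     = 6;   // jg / jz rel32
    constexpr int jmp_rel32_bytes     = 5;   // jmp rel32
    constexpr int mov_r64_imm64_bytes = 10;  // mov rcx, imm64 (REX.W + opcode + imm64)

    static_assert(mov_store_pc_bytes + jcc_rel32_bytes == 14);  // EmitPatchJg / EmitPatchJz
    static_assert(mov_store_pc_bytes + jmp_rel32_bytes == 13);  // EmitPatchJmp
    // EmitPatchMovRcx reserves exactly the 10-byte mov rcx, imm64.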
+
+void A32EmitX64::Unpatch(const IR::LocationDescriptor& location) {
+ EmitX64::Unpatch(location);
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.DisableWriting();
+ (*fast_dispatch_table_lookup)(location.Value()) = {};
+ code.EnableWriting();
+ }
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
new file mode 100644
index 0000000000..3706b30aae
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
@@ -0,0 +1,147 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <optional>
+#include <set>
+#include <tuple>
+
+#include <tsl/robin_map.h>
+
+#include "dynarmic/backend/block_range_information.h"
+#include "dynarmic/backend/x64/a32_jitstate.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/interface/A32/a32.h"
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/ir/terminal.h"
+
+namespace Dynarmic::Backend::X64 {
+
+class RegAlloc;
+
+struct A32EmitContext final : public EmitContext {
+ A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+
+ A32::LocationDescriptor Location() const;
+ A32::LocationDescriptor EndLocation() const;
+ bool IsSingleStep() const;
+ FP::FPCR FPCR(bool fpcr_controlled = true) const override;
+
+ bool HasOptimization(OptimizationFlag flag) const override {
+ return conf.HasOptimization(flag);
+ }
+
+ const A32::UserConfig& conf;
+};
+
+class A32EmitX64 final : public EmitX64 {
+public:
+ A32EmitX64(BlockOfCode& code, A32::UserConfig conf, A32::Jit* jit_interface);
+ ~A32EmitX64() override;
+
+ /**
+ * Emit host machine code for a basic block with intermediate representation `block`.
+ * @note block is modified.
+ */
+ BlockDescriptor Emit(IR::Block& block);
+
+ void ClearCache() override;
+
+ void InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges);
+
+protected:
+ const A32::UserConfig conf;
+ A32::Jit* jit_interface;
+ BlockRangeInformation<u32> block_ranges;
+
+ void EmitCondPrelude(const A32EmitContext& ctx);
+
+ struct FastDispatchEntry {
+ u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull;
+ const void* code_ptr = nullptr;
+ };
+ static_assert(sizeof(FastDispatchEntry) == 0x10);
+ static constexpr u64 fast_dispatch_table_mask = 0xFFFF0;
+ static constexpr size_t fast_dispatch_table_size = 0x10000;
+ std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
+ void ClearFastDispatchTable();
+
+ void (*memory_read_128)() = nullptr; // Dummy
+ void (*memory_write_128)() = nullptr; // Dummy
+
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks;
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
+ void GenFastmemFallbacks();
+
+ const void* terminal_handler_pop_rsb_hint;
+ const void* terminal_handler_fast_dispatch_hint = nullptr;
+ FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;
+ void GenTerminalHandlers();
+
+ // Microinstruction emitters
+#define OPCODE(...)
+#define A32OPC(name, type, ...) void EmitA32##name(A32EmitContext& ctx, IR::Inst* inst);
+#define A64OPC(...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+
+ // Helpers
+ std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override;
+
+ // Fastmem information
+ using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>;
+ struct FastmemPatchInfo {
+ u64 resume_rip;
+ u64 callback;
+ DoNotFastmemMarker marker;
+ bool recompile;
+ };
+ tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info;
+ std::set<DoNotFastmemMarker> do_not_fastmem;
+ std::optional<DoNotFastmemMarker> ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const;
+ FakeCall FastmemCallback(u64 rip);
+
+ // Memory access helpers
+ void EmitCheckMemoryAbort(A32EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end = nullptr);
+ template<std::size_t bitsize, auto callback>
+ void EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
+
+ // Terminal instruction emitters
+ void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location);
+ void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+
+ // Patching
+ void Unpatch(const IR::LocationDescriptor& target_desc) override;
+ void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) override;
+};
+
+} // namespace Dynarmic::Backend::X64
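The fast dispatch table declared above is a direct-mapped hash table of 16-byte entries. The fast-dispatch terminal handler hashes the location descriptor (mixing with CRC32 when SSE4.2 is available, otherwise using the descriptor directly) and applies fast_dispatch_table_mask as a 16-byte-aligned byte offset from the table base. A plain-C++ sketch using the A32 parameters above (the struct is mirrored locally and the function name is illustrative):

    #include <array>
    #include <cstdint>

    struct FastDispatchEntry {
        std::uint64_t location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull;
        const void* code_ptr = nullptr;
    };
    static_assert(sizeof(FastDispatchEntry) == 0x10);

    // The mask 0xFFFF0 keeps a 16-byte-aligned byte offset, which addresses each of
    // the 0x10000 entries exactly once; the emitted code adds this offset straight
    // onto the table base, which is the same entry as indexing by offset / entry size.
    FastDispatchEntry& Lookup(std::array<FastDispatchEntry, 0x10000>& table, std::uint64_t hash) {
        return table[(hash & 0xFFFF0) / sizeof(FastDispatchEntry)];
    }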
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp
new file mode 100644
index 0000000000..f2919485be
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp
@@ -0,0 +1,259 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+#include <initializer_list>
+#include <tuple>
+#include <utility>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/a32_emit_x64.h"
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/emit_x64_memory.h"
+#include "dynarmic/backend/x64/exclusive_monitor_friend.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/common/x64_disassemble.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void A32EmitX64::GenFastmemFallbacks() {
+ const std::initializer_list<int> idxes{0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
+ const std::array<std::pair<size_t, ArgCallback>, 4> read_callbacks{{
+ {8, Devirtualize<&A32::UserCallbacks::MemoryRead8>(conf.callbacks)},
+ {16, Devirtualize<&A32::UserCallbacks::MemoryRead16>(conf.callbacks)},
+ {32, Devirtualize<&A32::UserCallbacks::MemoryRead32>(conf.callbacks)},
+ {64, Devirtualize<&A32::UserCallbacks::MemoryRead64>(conf.callbacks)},
+ }};
+ const std::array<std::pair<size_t, ArgCallback>, 4> write_callbacks{{
+ {8, Devirtualize<&A32::UserCallbacks::MemoryWrite8>(conf.callbacks)},
+ {16, Devirtualize<&A32::UserCallbacks::MemoryWrite16>(conf.callbacks)},
+ {32, Devirtualize<&A32::UserCallbacks::MemoryWrite32>(conf.callbacks)},
+ {64, Devirtualize<&A32::UserCallbacks::MemoryWrite64>(conf.callbacks)},
+ }};
+ const std::array<std::pair<size_t, ArgCallback>, 4> exclusive_write_callbacks{{
+ {8, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)},
+ {16, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)},
+ {32, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)},
+ {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)},
+ }};
+
+ for (bool ordered : {false, true}) {
+ for (int vaddr_idx : idxes) {
+ for (int value_idx : idxes) {
+ for (const auto& [bitsize, callback] : read_callbacks) {
+ code.align();
+ read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ if (ordered) {
+ code.mfence();
+ }
+ callback.EmitCall(code);
+ if (value_idx != code.ABI_RETURN.getIdx()) {
+ code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN);
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
+ code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx});
+ code.ret();
+ PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize));
+ }
+
+ for (const auto& [bitsize, callback] : write_callbacks) {
+ code.align();
+ write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStack(code);
+ if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) {
+ code.xchg(code.ABI_PARAM2, code.ABI_PARAM3);
+ } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ } else {
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ }
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM3);
+ callback.EmitCall(code);
+ if (ordered) {
+ code.mfence();
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStack(code);
+ code.ret();
+ PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize));
+ }
+
+ for (const auto& [bitsize, callback] : exclusive_write_callbacks) {
+ code.align();
+ exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) {
+ code.xchg(code.ABI_PARAM2, code.ABI_PARAM3);
+ } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ } else {
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ }
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM3);
+ code.mov(code.ABI_PARAM4, rax);
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM4);
+ callback.EmitCall(code);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ code.ret();
+ PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize));
+ }
+ }
+ }
+ }
+}
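The nested loops above pre-generate one fallback thunk per combination of access ordering, access width, and the host registers holding the guest address and the value. A sketch of the keying scheme only (the register indices in the comment are hypothetical):

    #include <cstddef>
    #include <tuple>

    // Key type of read_fallbacks / write_fallbacks / exclusive_write_fallbacks:
    // (ordered access?, access size in bits, host reg index of vaddr, host reg index of value).
    using FallbackKey = std::tuple<bool, std::size_t, int, int>;

    // e.g. the thunk for a non-ordered 32-bit read with the address in host reg #1
    // and the result wanted in host reg #3 would live at
    //   read_fallbacks[FallbackKey{false, 32, 1, 3}];
    // "ordered" variants wrap the callback call with mfence, as generated above.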
+
+#define Axx A32
+#include "dynarmic/backend/x64/emit_x64_memory.cpp.inc"
+#undef Axx
+
+void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<8, &A32::UserCallbacks::MemoryWrite8>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<16, &A32::UserCallbacks::MemoryWrite16>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<32, &A32::UserCallbacks::MemoryWrite32>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst);
+}
+
+void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) {
+ code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0));
+}
+
+void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst);
+ }
+}
+
+void A32EmitX64::EmitCheckMemoryAbort(A32EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end) {
+ if (!conf.check_halt_on_memory_access) {
+ return;
+ }
+
+ Xbyak::Label skip;
+
+ const A32::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
+
+ code.test(dword[r15 + offsetof(A32JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
+ if (end) {
+ code.jz(*end, code.T_NEAR);
+ } else {
+ code.jz(skip, code.T_NEAR);
+ }
+ EmitSetUpperLocationDescriptor(current_location, ctx.Location());
+ code.mov(dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * 15], current_location.PC());
+ code.ForceReturnFromRunCode();
+ code.L(skip);
+}
+
+} // namespace Dynarmic::Backend::X64
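EmitCheckMemoryAbort above only acts when UserConfig::check_halt_on_memory_access is enabled: it tests halt_reason for a pending MemoryAbort right after the access, and on a hit writes the block's PC into R15 and force-returns from the run loop. A minimal sketch of the user-side half, assuming the callback raises the halt itself (helper name is illustrative):

    #include <dynarmic/interface/A32/a32.h>

    // Called from inside a memory callback when the access should abort; the
    // emitted check then unwinds back to the host with HaltReason::MemoryAbort.
    void SignalGuestMemoryAbort(Dynarmic::A32::Jit& jit) {
        jit.HaltExecution(Dynarmic::HaltReason::MemoryAbort);
    }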
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
new file mode 100644
index 0000000000..84f84f5db9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
@@ -0,0 +1,343 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <functional>
+#include <memory>
+#include <mutex>
+
+#include <boost/icl/interval_set.hpp>
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/a32_emit_x64.h"
+#include "dynarmic/backend/x64/a32_jitstate.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/callback.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/jitstate_info.h"
+#include "dynarmic/common/atomic.h"
+#include "dynarmic/common/x64_disassemble.h"
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+#include "dynarmic/interface/A32/a32.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/location_descriptor.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::A32 {
+
+using namespace Backend::X64;
+
+static RunCodeCallbacks GenRunCodeCallbacks(A32::UserCallbacks* cb, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg, const A32::UserConfig& conf) {
+ return RunCodeCallbacks{
+ std::make_unique<ArgCallback>(LookupBlock, reinterpret_cast<u64>(arg)),
+ std::make_unique<ArgCallback>(Devirtualize<&A32::UserCallbacks::AddTicks>(cb)),
+ std::make_unique<ArgCallback>(Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(cb)),
+ conf.enable_cycle_counting,
+ };
+}
+
+static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) {
+ return [conf](BlockOfCode& code) {
+ if (conf.page_table) {
+ code.mov(code.r14, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.mov(code.r13, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+ };
+}
+
+static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
+ return Optimization::PolyfillOptions{
+ .sha256 = !code.HasHostFeature(HostFeature::SHA),
+ .vector_multiply_widen = true,
+ };
+}
+
+struct Jit::Impl {
+ Impl(Jit* jit, A32::UserConfig conf)
+ : block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this, conf), JitStateInfo{jit_state}, conf.code_cache_size, GenRCP(conf))
+ , emitter(block_of_code, conf, jit)
+ , polyfill_options(GenPolyfillOptions(block_of_code))
+ , conf(std::move(conf))
+ , jit_interface(jit) {}
+
+ ~Impl() = default;
+
+ HaltReason Run() {
+ ASSERT(!jit_interface->is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason)));
+
+ jit_interface->is_executing = true;
+ SCOPE_EXIT {
+ jit_interface->is_executing = false;
+ };
+
+ const CodePtr current_codeptr = [this] {
+ // RSB optimization
+ const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A32JitState::RSBPtrMask;
+ if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) {
+ jit_state.rsb_ptr = new_rsb_ptr;
+ return reinterpret_cast<CodePtr>(jit_state.rsb_codeptrs[new_rsb_ptr]);
+ }
+
+ return GetCurrentBlock();
+ }();
+
+ const HaltReason hr = block_of_code.RunCode(&jit_state, current_codeptr);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ HaltReason Step() {
+ ASSERT(!jit_interface->is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason)));
+
+ jit_interface->is_executing = true;
+ SCOPE_EXIT {
+ jit_interface->is_executing = false;
+ };
+
+ const HaltReason hr = block_of_code.StepCode(&jit_state, GetCurrentSingleStep());
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ void ClearCache() {
+ std::unique_lock lock{invalidation_mutex};
+ invalidate_entire_cache = true;
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void InvalidateCacheRange(std::uint32_t start_address, std::size_t length) {
+ std::unique_lock lock{invalidation_mutex};
+ invalid_cache_ranges.add(boost::icl::discrete_interval<u32>::closed(start_address, static_cast<u32>(start_address + length - 1)));
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void Reset() {
+ ASSERT(!jit_interface->is_executing);
+ jit_state = {};
+ }
+
+ void HaltExecution(HaltReason hr) {
+ Atomic::Or(&jit_state.halt_reason, static_cast<u32>(hr));
+ }
+
+ void ClearHalt(HaltReason hr) {
+ Atomic::And(&jit_state.halt_reason, ~static_cast<u32>(hr));
+ }
+
+ void ClearExclusiveState() {
+ jit_state.exclusive_state = 0;
+ }
+
+ std::array<u32, 16>& Regs() {
+ return jit_state.Reg;
+ }
+
+ const std::array<u32, 16>& Regs() const {
+ return jit_state.Reg;
+ }
+
+ std::array<u32, 64>& ExtRegs() {
+ return jit_state.ExtReg;
+ }
+
+ const std::array<u32, 64>& ExtRegs() const {
+ return jit_state.ExtReg;
+ }
+
+ u32 Cpsr() const {
+ return jit_state.Cpsr();
+ }
+
+ void SetCpsr(u32 value) {
+ return jit_state.SetCpsr(value);
+ }
+
+ u32 Fpscr() const {
+ return jit_state.Fpscr();
+ }
+
+ void SetFpscr(u32 value) {
+ return jit_state.SetFpscr(value);
+ }
+
+ void DumpDisassembly() const {
+ const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
+ Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size);
+ }
+
+ std::vector<std::string> Disassemble() const {
+ const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
+ return Common::DisassembleX64(block_of_code.GetCodeBegin(), size);
+ }
+
+private:
+ static CodePtr GetCurrentBlockThunk(void* this_voidptr) {
+ Jit::Impl& this_ = *static_cast<Jit::Impl*>(this_voidptr);
+ return this_.GetCurrentBlock();
+ }
+
+ IR::LocationDescriptor GetCurrentLocation() const {
+ return IR::LocationDescriptor{jit_state.GetUniqueHash()};
+ }
+
+ CodePtr GetCurrentBlock() {
+ return GetBasicBlock(GetCurrentLocation()).entrypoint;
+ }
+
+ CodePtr GetCurrentSingleStep() {
+ return GetBasicBlock(A32::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true)).entrypoint;
+ }
+
+ A32EmitX64::BlockDescriptor GetBasicBlock(IR::LocationDescriptor descriptor) {
+ auto block = emitter.GetBasicBlock(descriptor);
+ if (block)
+ return *block;
+
+ constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024;
+ if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) {
+ invalidate_entire_cache = true;
+ PerformRequestedCacheInvalidation(HaltReason::CacheInvalidation);
+ }
+ block_of_code.EnsureMemoryCommitted(MINIMUM_REMAINING_CODESIZE);
+
+ IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions});
+ Optimization::PolyfillPass(ir_block, polyfill_options);
+ Optimization::NamingPass(ir_block);
+ if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) {
+ Optimization::A32GetSetElimination(ir_block, {.convert_nz_to_nzc = true});
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::ConstProp)) {
+ Optimization::A32ConstantMemoryReads(ir_block, conf.callbacks);
+ Optimization::ConstantPropagation(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ Optimization::IdentityRemovalPass(ir_block);
+ Optimization::VerificationPass(ir_block);
+ return emitter.Emit(ir_block);
+ }
+
+ void PerformRequestedCacheInvalidation(HaltReason hr) {
+ if (Has(hr, HaltReason::CacheInvalidation)) {
+ std::unique_lock lock{invalidation_mutex};
+
+ ClearHalt(HaltReason::CacheInvalidation);
+
+ if (!invalidate_entire_cache && invalid_cache_ranges.empty()) {
+ return;
+ }
+
+ jit_state.ResetRSB();
+ if (invalidate_entire_cache) {
+ block_of_code.ClearCache();
+ emitter.ClearCache();
+ } else {
+ emitter.InvalidateCacheRanges(invalid_cache_ranges);
+ }
+ invalid_cache_ranges.clear();
+ invalidate_entire_cache = false;
+ }
+ }
+
+ A32JitState jit_state;
+ BlockOfCode block_of_code;
+ A32EmitX64 emitter;
+ Optimization::PolyfillOptions polyfill_options;
+
+ const A32::UserConfig conf;
+
+ Jit* jit_interface;
+
+ // Requests made during execution to invalidate the cache are queued up here.
+ bool invalidate_entire_cache = false;
+ boost::icl::interval_set<u32> invalid_cache_ranges;
+ std::mutex invalidation_mutex;
+};
+
+Jit::Jit(UserConfig conf)
+ : impl(std::make_unique<Impl>(this, std::move(conf))) {}
+
+Jit::~Jit() = default;
+
+HaltReason Jit::Run() {
+ return impl->Run();
+}
+
+HaltReason Jit::Step() {
+ return impl->Step();
+}
+
+void Jit::ClearCache() {
+ impl->ClearCache();
+}
+
+void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) {
+ impl->InvalidateCacheRange(start_address, length);
+}
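As the Impl above shows, InvalidateCacheRange() only queues the closed interval [start, start+length-1] and raises a CacheInvalidation halt; the actual flush happens in PerformRequestedCacheInvalidation() before execution resumes. A minimal usage sketch (helper name, address, and page size are illustrative):

    #include <cstdint>

    #include <dynarmic/interface/A32/a32.h>

    void ReloadGuestPage(Dynarmic::A32::Jit& jit, std::uint32_t page_base) {
        jit.InvalidateCacheRange(page_base, 0x1000);   // queued; halt requested
        // ... rewrite the guest code backing this page here ...
        jit.Run();  // stale blocks in the range are discarded before the first block runs
    }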
+
+void Jit::Reset() {
+ impl->Reset();
+}
+
+void Jit::HaltExecution(HaltReason hr) {
+ impl->HaltExecution(hr);
+}
+
+void Jit::ClearHalt(HaltReason hr) {
+ impl->ClearHalt(hr);
+}
+
+std::array<std::uint32_t, 16>& Jit::Regs() {
+ return impl->Regs();
+}
+
+const std::array<std::uint32_t, 16>& Jit::Regs() const {
+ return impl->Regs();
+}
+
+std::array<std::uint32_t, 64>& Jit::ExtRegs() {
+ return impl->ExtRegs();
+}
+
+const std::array<std::uint32_t, 64>& Jit::ExtRegs() const {
+ return impl->ExtRegs();
+}
+
+std::uint32_t Jit::Cpsr() const {
+ return impl->Cpsr();
+}
+
+void Jit::SetCpsr(std::uint32_t value) {
+ impl->SetCpsr(value);
+}
+
+std::uint32_t Jit::Fpscr() const {
+ return impl->Fpscr();
+}
+
+void Jit::SetFpscr(std::uint32_t value) {
+ impl->SetFpscr(value);
+}
+
+void Jit::ClearExclusiveState() {
+ impl->ClearExclusiveState();
+}
+
+void Jit::DumpDisassembly() const {
+ impl->DumpDisassembly();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp
new file mode 100644
index 0000000000..2e201c1cc4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp
@@ -0,0 +1,208 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/a32_jitstate.h"
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/nzcv_util.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+
+namespace Dynarmic::Backend::X64 {
+
+/**
+ * CPSR Bits
+ * =========
+ *
+ * ARM CPSR flags
+ * --------------
+ * N bit 31 Negative flag
+ * Z bit 30 Zero flag
+ * C bit 29 Carry flag
+ * V bit 28 oVerflow flag
+ * Q bit 27 Saturation flag
+ * IT[1:0] bits 25-26 If-Then execution state (lower 2 bits)
+ * J bit 24 Jazelle instruction set flag
+ * GE bits 16-19 Greater than or Equal flags
+ * IT[7:2] bits 10-15 If-Then execution state (upper 6 bits)
+ * E bit 9 Data Endianness flag
+ * A bit 8 Disable imprecise Aborts
+ * I bit 7 Disable IRQ interrupts
+ * F bit 6 Disable FIQ interrupts
+ * T bit 5 Thumb instruction set flag
+ * M bits 0-4 Processor Mode bits
+ *
+ * x64 LAHF+SETO flags
+ * -------------------
+ * SF bit 15 Sign flag
+ * ZF bit 14 Zero flag
+ * AF bit 12 Auxiliary flag
+ * PF bit 10 Parity flag
+ * CF bit 8 Carry flag
+ * OF bit 0 Overflow flag
+ */
+
+u32 A32JitState::Cpsr() const {
+ DEBUG_ASSERT((cpsr_q & ~1) == 0);
+ DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0);
+
+ u32 cpsr = 0;
+
+ // NZCV flags
+ cpsr |= NZCV::FromX64(cpsr_nzcv);
+ // Q flag
+ cpsr |= cpsr_q ? 1 << 27 : 0;
+ // GE flags
+ cpsr |= mcl::bit::get_bit<31>(cpsr_ge) ? 1 << 19 : 0;
+ cpsr |= mcl::bit::get_bit<23>(cpsr_ge) ? 1 << 18 : 0;
+ cpsr |= mcl::bit::get_bit<15>(cpsr_ge) ? 1 << 17 : 0;
+ cpsr |= mcl::bit::get_bit<7>(cpsr_ge) ? 1 << 16 : 0;
+ // E flag, T flag
+ cpsr |= mcl::bit::get_bit<1>(upper_location_descriptor) ? 1 << 9 : 0;
+ cpsr |= mcl::bit::get_bit<0>(upper_location_descriptor) ? 1 << 5 : 0;
+ // IT state
+ cpsr |= static_cast<u32>(upper_location_descriptor & 0b11111100'00000000);
+ cpsr |= static_cast<u32>(upper_location_descriptor & 0b00000011'00000000) << 17;
+ // Other flags
+ cpsr |= cpsr_jaifm;
+
+ return cpsr;
+}
+
+void A32JitState::SetCpsr(u32 cpsr) {
+ // NZCV flags
+ cpsr_nzcv = NZCV::ToX64(cpsr);
+ // Q flag
+ cpsr_q = mcl::bit::get_bit<27>(cpsr) ? 1 : 0;
+ // GE flags
+ cpsr_ge = 0;
+ cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0;
+ cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0;
+ cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0;
+ cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 0x000000FF : 0;
+
+ upper_location_descriptor &= 0xFFFF0000;
+ // E flag, T flag
+ upper_location_descriptor |= mcl::bit::get_bit<9>(cpsr) ? 2 : 0;
+ upper_location_descriptor |= mcl::bit::get_bit<5>(cpsr) ? 1 : 0;
+ // IT state
+ upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000;
+ upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000;
+
+ // Other flags
+ cpsr_jaifm = cpsr & 0x010001DF;
+}
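Note how the four CPSR.GE bits (bits 19..16) are widened to whole bytes in cpsr_ge so the emitted parallel add/subtract and SEL code can use them directly as a byte mask; Cpsr() then reads back only the top bit of each byte. A standalone restatement of that packing (function name is illustrative):

    #include <cstdint>

    std::uint32_t ExpandGE(std::uint32_t cpsr) {
        std::uint32_t ge = 0;
        ge |= (cpsr & (1u << 19)) ? 0xFF000000 : 0;
        ge |= (cpsr & (1u << 18)) ? 0x00FF0000 : 0;
        ge |= (cpsr & (1u << 17)) ? 0x0000FF00 : 0;
        ge |= (cpsr & (1u << 16)) ? 0x000000FF : 0;
        return ge;
    }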
+
+void A32JitState::ResetRSB() {
+ rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull);
+ rsb_codeptrs.fill(0);
+}
+
+/**
+ * Comparing MXCSR and FPSCR
+ * =========================
+ *
+ * SSE MXCSR exception flags
+ * -------------------------
+ * PE bit 5 Precision Flag
+ * UE bit 4 Underflow Flag
+ * OE bit 3 Overflow Flag
+ * ZE bit 2 Divide By Zero Flag
+ * DE bit 1 Denormal Flag // Appears to only be set when MXCSR.DAZ = 0
+ * IE bit 0 Invalid Operation Flag
+ *
+ * VFP FPSCR cumulative exception bits
+ * -----------------------------------
+ * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPSCR.FTZ = 1
+ * IXC bit 4 Inexact cumulative exception bit
+ * UFC bit 3 Underflow cumulative exception bit
+ * OFC bit 2 Overflow cumulative exception bit
+ * DZC bit 1 Division by Zero cumulative exception bit
+ * IOC bit 0 Invalid Operation cumulative exception bit
+ *
+ * SSE MXCSR exception masks
+ * -------------------------
+ * PM bit 12 Precision Mask
+ * UM bit 11 Underflow Mask
+ * OM bit 10 Overflow Mask
+ * ZM bit 9 Divide By Zero Mask
+ * DM bit 8 Denormal Mask
+ * IM bit 7 Invalid Operation Mask
+ *
+ * VFP FPSCR exception trap enables
+ * --------------------------------
+ * IDE bit 15 Input Denormal exception trap enable
+ * IXE bit 12 Inexact exception trap enable
+ * UFE bit 11 Underflow exception trap enable
+ * OFE bit 10 Overflow exception trap enable
+ * DZE bit 9 Division by Zero exception trap enable
+ * IOE bit 8 Invalid Operation exception trap enable
+ *
+ * SSE MXCSR mode bits
+ * -------------------
+ * FZ bit 15 Flush To Zero
+ * DAZ bit 6 Denormals Are Zero
+ * RN bits 13-14 Round to {0 = Nearest, 1 = Negative, 2 = Positive, 3 = Zero}
+ *
+ * VFP FPSCR mode bits
+ * -------------------
+ * AHP bit 26 Alternate half-precision
+ * DN bit 25 Default NaN
+ * FZ bit 24 Flush to Zero
+ * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero}
+ * Stride bits 20-21 Vector stride
+ * Len bits 16-18 Vector length
+ */
+
+// NZCV; QC (ASIMD only), AHP; DN, FZ, RMode, Stride; SBZP; Len; trap enables; cumulative bits
+constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
+constexpr u32 FPSCR_NZCV_MASK = 0xF0000000;
+
+u32 A32JitState::Fpscr() const {
+ DEBUG_ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0);
+
+ const u32 fpcr_mode = static_cast<u32>(upper_location_descriptor) & FPSCR_MODE_MASK;
+ const u32 mxcsr = guest_MXCSR | asimd_MXCSR;
+
+ u32 FPSCR = fpcr_mode | fpsr_nzcv;
+ FPSCR |= (mxcsr & 0b0000000000001); // IOC = IE
+ FPSCR |= (mxcsr & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE
+ FPSCR |= fpsr_exc;
+ FPSCR |= fpsr_qc != 0 ? 1 << 27 : 0;
+
+ return FPSCR;
+}
+
+void A32JitState::SetFpscr(u32 FPSCR) {
+ // Ensure that only upper half of upper_location_descriptor is used for FPSCR bits.
+ static_assert((FPSCR_MODE_MASK & 0xFFFF0000) == FPSCR_MODE_MASK);
+
+ upper_location_descriptor &= 0x0000FFFF;
+ upper_location_descriptor |= FPSCR & FPSCR_MODE_MASK;
+
+ fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK;
+ fpsr_qc = (FPSCR >> 27) & 1;
+
+ guest_MXCSR = 0x00001f80;
+ asimd_MXCSR = 0x00009fc0;
+
+ // RMode
+ const std::array<u32, 4> MXCSR_RMode{0x0, 0x4000, 0x2000, 0x6000};
+ guest_MXCSR |= MXCSR_RMode[(FPSCR >> 22) & 0x3];
+
+ // Cumulative flags IDC, IOC, IXC, UFC, OFC, DZC
+ fpsr_exc = FPSCR & 0x9F;
+
+ if (mcl::bit::get_bit<24>(FPSCR)) {
+ // VFP Flush to Zero
+ guest_MXCSR |= (1 << 15); // SSE Flush to Zero
+ guest_MXCSR |= (1 << 6); // SSE Denormals are Zero
+ }
+}
+
+} // namespace Dynarmic::Backend::X64
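The lookup table in SetFpscr() exists because the two rounding-mode encodings disagree in the middle: FPSCR.RMode (bits 23:22) is {0 = Nearest, 1 = +inf, 2 = -inf, 3 = Zero} while MXCSR.RC (bits 14:13) is {0 = Nearest, 1 = -inf, 2 = +inf, 3 = Zero}. The translation in isolation (function name is illustrative):

    #include <array>
    #include <cstdint>

    std::uint32_t RModeToMxcsrBits(std::uint32_t fpscr) {
        // Entries 1 and 2 are swapped relative to FPSCR because MXCSR orders -inf before +inf.
        static constexpr std::array<std::uint32_t, 4> table{0x0000, 0x4000, 0x2000, 0x6000};
        return table[(fpscr >> 22) & 0x3];
    }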
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h
new file mode 100644
index 0000000000..cc13abf8e2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h
@@ -0,0 +1,97 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::X64 {
+
+class BlockOfCode;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+struct A32JitState {
+ using ProgramCounterType = u32;
+
+ A32JitState() { ResetRSB(); }
+
+ std::array<u32, 16> Reg{}; // Current register file.
+ // TODO: Mode-specific register sets unimplemented.
+
+ u32 upper_location_descriptor = 0;
+
+ u32 cpsr_ge = 0;
+ u32 cpsr_q = 0;
+ u32 cpsr_nzcv = 0;
+ u32 cpsr_jaifm = 0;
+ u32 Cpsr() const;
+ void SetCpsr(u32 cpsr);
+
+ alignas(16) std::array<u32, 64> ExtReg{}; // Extension registers.
+
+ // For internal use (See: BlockOfCode::RunCode)
+ u32 guest_MXCSR = 0x00001f80;
+ u32 asimd_MXCSR = 0x00009fc0;
+ volatile u32 halt_reason = 0;
+
+ // Exclusive state
+ u32 exclusive_state = 0;
+
+ static constexpr size_t RSBSize = 8; // MUST be a power of 2.
+ static constexpr size_t RSBPtrMask = RSBSize - 1;
+ u32 rsb_ptr = 0;
+ std::array<u64, RSBSize> rsb_location_descriptors;
+ std::array<u64, RSBSize> rsb_codeptrs;
+ void ResetRSB();
+
+ u32 fpsr_exc = 0;
+ u32 fpsr_qc = 0;
+ u32 fpsr_nzcv = 0;
+ u32 Fpscr() const;
+ void SetFpscr(u32 FPSCR);
+
+ u64 GetUniqueHash() const noexcept {
+ return (static_cast<u64>(upper_location_descriptor) << 32) | (static_cast<u64>(Reg[15]));
+ }
+
+ void TransferJitState(const A32JitState& src, bool reset_rsb) {
+ Reg = src.Reg;
+ upper_location_descriptor = src.upper_location_descriptor;
+ cpsr_ge = src.cpsr_ge;
+ cpsr_q = src.cpsr_q;
+ cpsr_nzcv = src.cpsr_nzcv;
+ cpsr_jaifm = src.cpsr_jaifm;
+ ExtReg = src.ExtReg;
+ guest_MXCSR = src.guest_MXCSR;
+ asimd_MXCSR = src.asimd_MXCSR;
+ fpsr_exc = src.fpsr_exc;
+ fpsr_qc = src.fpsr_qc;
+ fpsr_nzcv = src.fpsr_nzcv;
+
+ exclusive_state = 0;
+
+ if (reset_rsb) {
+ ResetRSB();
+ } else {
+ rsb_ptr = src.rsb_ptr;
+ rsb_location_descriptors = src.rsb_location_descriptors;
+ rsb_codeptrs = src.rsb_codeptrs;
+ }
+ }
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+using CodePtr = const void*;
+
+} // namespace Dynarmic::Backend::X64
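The 64-bit hash returned by GetUniqueHash() above is what keys compiled blocks: the upper 32 bits are the "upper location descriptor" (T/E flags, IT state, FPSCR mode) and the lower 32 bits are the guest PC (R15), matching A32::LocationDescriptor. A standalone restatement (function name is illustrative):

    #include <cstdint>

    std::uint64_t MakeUniqueHash(std::uint32_t upper_location_descriptor, std::uint32_t pc) {
        return (std::uint64_t{upper_location_descriptor} << 32) | std::uint64_t{pc};
    }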
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
new file mode 100644
index 0000000000..cec781380b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
@@ -0,0 +1,763 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/a64_emit_x64.h"
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/assert.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/integer_of_size.hpp>
+
+#include "dynarmic/backend/x64/a64_jitstate.h"
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/backend/x64/nzcv_util.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/a64_types.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/cond.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary.
+// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart.
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
+ : EmitContext(reg_alloc, block), conf(conf) {}
+
+A64::LocationDescriptor A64EmitContext::Location() const {
+ return A64::LocationDescriptor{block.Location()};
+}
+
+bool A64EmitContext::IsSingleStep() const {
+ return Location().SingleStepping();
+}
+
+FP::FPCR A64EmitContext::FPCR(bool fpcr_controlled) const {
+ return fpcr_controlled ? Location().FPCR() : Location().FPCR().ASIMDStandardValue();
+}
+
+A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface)
+ : EmitX64(code), conf(conf), jit_interface{jit_interface} {
+ GenMemory128Accessors();
+ GenFastmemFallbacks();
+ GenTerminalHandlers();
+ code.PreludeComplete();
+ ClearFastDispatchTable();
+
+ exception_handler.SetFastmemCallback([this](u64 rip_) {
+ return FastmemCallback(rip_);
+ });
+}
+
+A64EmitX64::~A64EmitX64() = default;
+
+A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
+ if (conf.very_verbose_debugging_output) {
+ std::puts(IR::DumpBlock(block).c_str());
+ }
+
+ code.EnableWriting();
+ SCOPE_EXIT {
+ code.DisableWriting();
+ };
+
+ const std::vector<HostLoc> gpr_order = [this] {
+ std::vector<HostLoc> gprs{any_gpr};
+ if (conf.page_table) {
+ gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
+ }
+ if (conf.fastmem_pointer) {
+ gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
+ }
+ return gprs;
+ }();
+
+ RegAlloc reg_alloc{code, gpr_order, any_xmm};
+ A64EmitContext ctx{conf, reg_alloc, block};
+
+ // Start emitting.
+ code.align();
+ const u8* const entrypoint = code.getCurr();
+
+ ASSERT(block.GetCondition() == IR::Cond::AL);
+
+ for (auto iter = block.begin(); iter != block.end(); ++iter) {
+ IR::Inst* inst = &*iter;
+
+ // Call the relevant Emit* member function.
+ switch (inst->GetOpcode()) {
+#define OPCODE(name, type, ...) \
+ case IR::Opcode::name: \
+ A64EmitX64::Emit##name(ctx, inst); \
+ break;
+#define A32OPC(...)
+#define A64OPC(name, type, ...) \
+ case IR::Opcode::A64##name: \
+ A64EmitX64::EmitA64##name(ctx, inst); \
+ break;
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+
+ default:
+ ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode());
+ break;
+ }
+
+ ctx.reg_alloc.EndOfAllocScope();
+
+ if (conf.very_verbose_debugging_output) {
+ EmitVerboseDebuggingOutput(reg_alloc);
+ }
+ }
+
+ reg_alloc.AssertNoMoreUses();
+
+ if (conf.enable_cycle_counting) {
+ EmitAddCycles(block.CycleCount());
+ }
+ EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
+ code.int3();
+
+ for (auto& deferred_emit : ctx.deferred_emits) {
+ deferred_emit();
+ }
+ code.int3();
+
+ const size_t size = static_cast<size_t>(code.getCurr() - entrypoint);
+
+ const A64::LocationDescriptor descriptor{block.Location()};
+ const A64::LocationDescriptor end_location{block.EndLocation()};
+
+ const auto range = boost::icl::discrete_interval<u64>::closed(descriptor.PC(), end_location.PC() - 1);
+ block_ranges.AddRange(range, descriptor);
+
+ return RegisterBlock(descriptor, entrypoint, size);
+}
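The opcode dispatch in the loop above is generated with X-macros: each A64OPC entry in dynarmic/ir/opcodes.inc expands to one case of the switch, forwarding to the matching EmitA64* member defined later in this file. For example, the entry for SetW (argument list of the .inc entry elided here) expands to:

    case IR::Opcode::A64SetW:
        A64EmitX64::EmitA64SetW(ctx, inst);
        break;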
+
+void A64EmitX64::ClearCache() {
+ EmitX64::ClearCache();
+ block_ranges.ClearCache();
+ ClearFastDispatchTable();
+ fastmem_patch_info.clear();
+}
+
+void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) {
+ InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges));
+}
+
+void A64EmitX64::ClearFastDispatchTable() {
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ fast_dispatch_table.fill({});
+ }
+}
+
+void A64EmitX64::GenTerminalHandlers() {
+ // PC ends up in rbp, location_descriptor ends up in rbx
+ const auto calculate_location_descriptor = [this] {
+ // This calculation has to match up with A64::LocationDescriptor::UniqueHash
+ // TODO: Optimization is available here based on known state of fpcr.
+ code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]);
+ code.mov(rcx, A64::LocationDescriptor::pc_mask);
+ code.and_(rcx, rbp);
+ code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]);
+ code.and_(ebx, A64::LocationDescriptor::fpcr_mask);
+ code.shl(rbx, A64::LocationDescriptor::fpcr_shift);
+ code.or_(rbx, rcx);
+ };
+
+ Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
+
+ code.align();
+ terminal_handler_pop_rsb_hint = code.getCurr<const void*>();
+ calculate_location_descriptor();
+ code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]);
+ code.sub(eax, 1);
+ code.and_(eax, u32(A64JitState::RSBPtrMask));
+ code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax);
+ code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]);
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.jne(rsb_cache_miss);
+ } else {
+ code.jne(code.GetReturnFromRunCodeAddress());
+ }
+ code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]);
+ code.jmp(rax);
+ PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint");
+
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.align();
+ terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
+ calculate_location_descriptor();
+ code.L(rsb_cache_miss);
+ code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
+ code.mov(rbp, rbx);
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ code.crc32(rbp, r12);
+ }
+ code.and_(ebp, fast_dispatch_table_mask);
+ code.lea(rbp, ptr[r12 + rbp]);
+ code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
+ code.jne(fast_dispatch_cache_miss);
+ code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]);
+ code.L(fast_dispatch_cache_miss);
+ code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx);
+ code.LookupBlock();
+ code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax);
+ code.jmp(rax);
+ PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint");
+
+ code.align();
+ fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry& (*)(u64)>();
+ code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
+ }
+ code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask);
+ code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM2 + code.ABI_PARAM1]);
+ code.ret();
+ PerfMapRegister(fast_dispatch_table_lookup, code.getCurr(), "a64_fast_dispatch_table_lookup");
+ }
+}
+
+void A64EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) {
+ if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) {
+ return;
+ }
+
+ EmitX64::EmitPushRSB(ctx, inst);
+}
+
+void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8();
+ code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store);
+}
+
+void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
+ code.shr(result, NZCV::x64_c_flag_bit);
+ code.and_(result, 1);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
+
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(tmp, NZCV::x64_mask);
+ code.pext(nzcv_raw, nzcv_raw, tmp);
+ code.shl(nzcv_raw, 28);
+ } else {
+ code.and_(nzcv_raw, NZCV::x64_mask);
+ code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
+ code.and_(nzcv_raw, NZCV::arm_mask);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, nzcv_raw);
+}
+
+void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ code.shr(nzcv_raw, 28);
+ if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(tmp, NZCV::x64_mask);
+ code.pdep(nzcv_raw, nzcv_raw, tmp);
+ } else {
+ code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
+ code.and_(nzcv_raw, NZCV::x64_mask);
+ }
+ code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
+}
+
+void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store);
+}
+
+void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.mov(result, dword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) {
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(result, qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movd(result, addr);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movq(result, addr);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movaps(result, addr);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ code.mov(result, qword[r15 + offsetof(A64JitState, sp)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, dword[r15 + offsetof(A64JitState, fpcr)]);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+static u32 GetFPSRImpl(A64JitState* jit_state) {
+ return jit_state->GetFpsr();
+}
+
+void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.HostCall(inst);
+ code.mov(code.ABI_PARAM1, code.r15);
+ code.stmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
+ code.CallFunction(GetFPSRImpl);
+}
+
+void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+ const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
+ if (args[1].FitsInImmediateS32()) {
+ code.mov(addr, args[1].GetImmediateS32());
+ } else {
+ // TODO: zext tracking, xmm variant
+ const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(args[1]);
+ code.mov(to_store.cvt32(), to_store.cvt32());
+ code.mov(addr, to_store);
+ }
+}
+
+void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Reg reg = inst->GetArg(0).GetA64RegRef();
+ const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)];
+ if (args[1].FitsInImmediateS32()) {
+ code.mov(addr, args[1].GetImmediateS32());
+ } else if (args[1].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ code.movq(addr, to_store);
+ } else {
+ const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]);
+ code.mov(addr, to_store);
+ }
+}
+
+void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ // TODO: Optimize
+ code.pxor(tmp, tmp);
+ code.movss(tmp, to_store);
+ code.movaps(addr, tmp);
+}
+
+void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]);
+ code.movq(to_store, to_store); // TODO: Remove when able
+ code.movaps(addr, to_store);
+}
+
+void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const A64::Vec vec = inst->GetArg(0).GetA64VecRef();
+ const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];
+
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]);
+ code.movaps(addr, to_store);
+}
+
+void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto addr = qword[r15 + offsetof(A64JitState, sp)];
+ if (args[0].FitsInImmediateS32()) {
+ code.mov(addr, args[0].GetImmediateS32());
+ } else if (args[0].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
+ code.movq(addr, to_store);
+ } else {
+ const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
+ code.mov(addr, to_store);
+ }
+}
+
+static void SetFPCRImpl(A64JitState* jit_state, u32 value) {
+ jit_state->SetFpcr(value);
+}
+
+void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
+ code.mov(code.ABI_PARAM1, code.r15);
+ code.CallFunction(SetFPCRImpl);
+ code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
+}
+
+static void SetFPSRImpl(A64JitState* jit_state, u32 value) {
+ jit_state->SetFpsr(value);
+}
+
+void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
+ code.mov(code.ABI_PARAM1, code.r15);
+ code.CallFunction(SetFPSRImpl);
+ code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]);
+}
+
+void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto addr = qword[r15 + offsetof(A64JitState, pc)];
+ if (args[0].FitsInImmediateS32()) {
+ code.mov(addr, args[0].GetImmediateS32());
+ } else if (args[0].IsInXmm()) {
+ const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]);
+ code.movq(addr, to_store);
+ } else {
+ const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]);
+ code.mov(addr, to_store);
+ }
+}
+
+void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.HostCall(nullptr);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[0].IsImmediate());
+ const u32 imm = args[0].GetImmediateU32();
+ Devirtualize<&A64::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code, [&](RegList param) {
+ code.mov(param[0], imm);
+ });
+ // The kernel would have to execute ERET to get here, which would clear exclusive state.
+ code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
+}
+
+void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.HostCall(nullptr);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[0].IsImmediate() && args[1].IsImmediate());
+ const u64 pc = args[0].GetImmediateU64();
+ const u64 exception = args[1].GetImmediateU64();
+ Devirtualize<&A64::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code, [&](RegList param) {
+ code.mov(param[0], pc);
+ code.mov(param[1], exception);
+ });
+}
+
+void A64EmitX64::EmitA64DataCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]);
+ Devirtualize<&A64::UserCallbacks::DataCacheOperationRaised>(conf.callbacks).EmitCall(code);
+}
+
+void A64EmitX64::EmitA64InstructionCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]);
+ Devirtualize<&A64::UserCallbacks::InstructionCacheOperationRaised>(conf.callbacks).EmitCall(code);
+}
+
+void A64EmitX64::EmitA64DataSynchronizationBarrier(A64EmitContext&, IR::Inst*) {
+ code.mfence();
+ code.lfence();
+}
+
+void A64EmitX64::EmitA64DataMemoryBarrier(A64EmitContext&, IR::Inst*) {
+ code.mfence();
+}
+
+void A64EmitX64::EmitA64InstructionSynchronizationBarrier(A64EmitContext& ctx, IR::Inst*) {
+ if (!conf.hook_isb) {
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(nullptr);
+ Devirtualize<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code);
+}
+
+void A64EmitX64::EmitA64GetCNTFRQ(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, conf.cntfrq_el0);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.HostCall(inst);
+ if (!conf.wall_clock_cntpct) {
+ code.UpdateTicks();
+ }
+ Devirtualize<&A64::UserCallbacks::GetCNTPCT>(conf.callbacks).EmitCall(code);
+}
+
+void A64EmitX64::EmitA64GetCTR(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, conf.ctr_el0);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetDCZID(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, conf.dczid_el0);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ if (conf.tpidr_el0) {
+ code.mov(result, u64(conf.tpidr_el0));
+ code.mov(result, qword[result]);
+ } else {
+ code.xor_(result.cvt32(), result.cvt32());
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64GetTPIDRRO(A64EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ if (conf.tpidrro_el0) {
+ code.mov(result, u64(conf.tpidrro_el0));
+ code.mov(result, qword[result]);
+ } else {
+ code.xor_(result.cvt32(), result.cvt32());
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr();
+ if (conf.tpidr_el0) {
+ code.mov(addr, u64(conf.tpidr_el0));
+ code.mov(qword[addr], value);
+ }
+}
+
+std::string A64EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const {
+ const A64::LocationDescriptor descriptor{ir_descriptor};
+ return fmt::format("a64_{:016X}_fpcr{:08X}",
+ descriptor.PC(),
+ descriptor.FPCR().Value());
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor, bool) {
+ code.SwitchMxcsrOnExit();
+ Devirtualize<&A64::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code, [&](RegList param) {
+ code.mov(param[0], A64::LocationDescriptor{terminal.next}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], param[0]);
+ code.mov(param[1].cvt32(), terminal.num_instructions);
+ });
+ code.ReturnFromRunCode(true); // TODO: Check cycles
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) {
+ code.ReturnFromRunCode();
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
+ code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.ReturnFromRunCode();
+ return;
+ }
+
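+    // With cycle counting enabled the link is only taken while cycles remain (jg patch);
+    // otherwise it is only taken while no halt has been requested (jz patch). When the
+    // condition fails, execution falls through to the forced return below.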
+ if (conf.enable_cycle_counting) {
+ code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
+
+ patch_information[terminal.next].jg.push_back(code.getCurr());
+ if (const auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJg(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJg(terminal.next);
+ }
+ } else {
+ code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0);
+
+ patch_information[terminal.next].jz.push_back(code.getCurr());
+ if (const auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJz(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJz(terminal.next);
+ }
+ }
+
+ code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.ForceReturnFromRunCode();
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) {
+ code.mov(rax, A64::LocationDescriptor{terminal.next}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ patch_information[terminal.next].jmp.push_back(code.getCurr());
+ if (auto next_bb = GetBasicBlock(terminal.next)) {
+ EmitPatchJmp(terminal.next, next_bb->entrypoint);
+ } else {
+ EmitPatchJmp(terminal.next);
+ }
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) {
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ code.jmp(terminal_handler_pop_rsb_hint);
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) {
+ if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) {
+ code.ReturnFromRunCode();
+ return;
+ }
+
+ code.jmp(terminal_handler_fast_dispatch_hint);
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ switch (terminal.if_) {
+ case IR::Cond::AL:
+ case IR::Cond::NV:
+ EmitTerminal(terminal.then_, initial_location, is_single_step);
+ break;
+ default:
+ Xbyak::Label pass = EmitCond(terminal.if_);
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+ code.L(pass);
+ EmitTerminal(terminal.then_, initial_location, is_single_step);
+ break;
+ }
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ Xbyak::Label fail;
+ code.cmp(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], u8(0));
+ code.jz(fail);
+ EmitTerminal(terminal.then_, initial_location, is_single_step);
+ code.L(fail);
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+}
+
+void A64EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0);
+ code.jne(code.GetForceReturnFromRunCodeAddress());
+ EmitTerminal(terminal.else_, initial_location, is_single_step);
+}
+
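+// Each patchable jump below is padded to a fixed byte count (EnsurePatchLocationSize) so
+// that it can later be rewritten in place when the target block is compiled or invalidated.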
+void A64EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jg(target_code_ptr);
+ } else {
+ code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.jg(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 23);
+}
+
+void A64EmitX64::EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jz(target_code_ptr);
+ } else {
+ code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.jz(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 23);
+}
+
+void A64EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr patch_location = code.getCurr();
+ if (target_code_ptr) {
+ code.jmp(target_code_ptr);
+ } else {
+ code.mov(rax, A64::LocationDescriptor{target_desc}.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.jmp(code.GetReturnFromRunCodeAddress());
+ }
+ code.EnsurePatchLocationSize(patch_location, 22);
+}
+
+void A64EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) {
+ if (!target_code_ptr) {
+ target_code_ptr = code.GetReturnFromRunCodeAddress();
+ }
+ const CodePtr patch_location = code.getCurr();
+ code.mov(code.rcx, reinterpret_cast<u64>(target_code_ptr));
+ code.EnsurePatchLocationSize(patch_location, 10);
+}
+
+void A64EmitX64::Unpatch(const IR::LocationDescriptor& location) {
+ EmitX64::Unpatch(location);
+ if (conf.HasOptimization(OptimizationFlag::FastDispatch)) {
+ code.DisableWriting();
+ (*fast_dispatch_table_lookup)(location.Value()) = {};
+ code.EnableWriting();
+ }
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
new file mode 100644
index 0000000000..ed49315512
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
@@ -0,0 +1,144 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <map>
+#include <optional>
+#include <set>
+#include <tuple>
+
+#include <tsl/robin_map.h>
+
+#include "dynarmic/backend/block_range_information.h"
+#include "dynarmic/backend/x64/a64_jitstate.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/interface/A64/a64.h"
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/ir/terminal.h"
+
+namespace Dynarmic::Backend::X64 {
+
+class RegAlloc;
+
+struct A64EmitContext final : public EmitContext {
+ A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+
+ A64::LocationDescriptor Location() const;
+ bool IsSingleStep() const;
+ FP::FPCR FPCR(bool fpcr_controlled = true) const override;
+
+ bool HasOptimization(OptimizationFlag flag) const override {
+ return conf.HasOptimization(flag);
+ }
+
+ const A64::UserConfig& conf;
+};
+
+class A64EmitX64 final : public EmitX64 {
+public:
+ A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface);
+ ~A64EmitX64() override;
+
+ /**
+ * Emit host machine code for a basic block with intermediate representation `block`.
+ * @note block is modified.
+ */
+ BlockDescriptor Emit(IR::Block& block);
+
+ void ClearCache() override;
+
+ void InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges);
+
+protected:
+ const A64::UserConfig conf;
+ A64::Jit* jit_interface;
+ BlockRangeInformation<u64> block_ranges;
+
+ struct FastDispatchEntry {
+ u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull;
+ const void* code_ptr = nullptr;
+ };
+ static_assert(sizeof(FastDispatchEntry) == 0x10);
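+    // Direct-mapped table: 0x100000 entries of 16 bytes each; the mask therefore selects
+    // a 16-byte-aligned byte offset into the table.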
+ static constexpr u64 fast_dispatch_table_mask = 0xFFFFF0;
+ static constexpr size_t fast_dispatch_table_size = 0x100000;
+ std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
+ void ClearFastDispatchTable();
+
+ void (*memory_read_128)();
+ void (*memory_write_128)();
+ void (*memory_exclusive_write_128)();
+ void GenMemory128Accessors();
+
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks;
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
+ std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
+ void GenFastmemFallbacks();
+
+ const void* terminal_handler_pop_rsb_hint;
+ const void* terminal_handler_fast_dispatch_hint = nullptr;
+ FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;
+ void GenTerminalHandlers();
+
+ // Microinstruction emitters
+ void EmitPushRSB(EmitContext& ctx, IR::Inst* inst);
+#define OPCODE(...)
+#define A32OPC(...)
+#define A64OPC(name, type, ...) void EmitA64##name(A64EmitContext& ctx, IR::Inst* inst);
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+
+ // Helpers
+ std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override;
+
+ // Fastmem information
+ using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>;
+ struct FastmemPatchInfo {
+ u64 resume_rip;
+ u64 callback;
+ DoNotFastmemMarker marker;
+ bool recompile;
+ };
+ tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info;
+ std::set<DoNotFastmemMarker> do_not_fastmem;
+ std::optional<DoNotFastmemMarker> ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const;
+ FakeCall FastmemCallback(u64 rip);
+
+ // Memory access helpers
+ void EmitCheckMemoryAbort(A64EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end = nullptr);
+ template<std::size_t bitsize, auto callback>
+ void EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst);
+ template<std::size_t bitsize, auto callback>
+ void EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst);
+
+ // Terminal instruction emitters
+ void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+ void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override;
+
+ // Patching
+ void Unpatch(const IR::LocationDescriptor& target_desc) override;
+ void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override;
+ void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) override;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp
new file mode 100644
index 0000000000..450b16d000
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp
@@ -0,0 +1,431 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+#include <initializer_list>
+#include <tuple>
+#include <utility>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/a64_emit_x64.h"
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/emit_x64_memory.h"
+#include "dynarmic/backend/x64/exclusive_monitor_friend.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/common/spin_lock_x64.h"
+#include "dynarmic/common/x64_disassemble.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void A64EmitX64::GenMemory128Accessors() {
+ code.align();
+ memory_read_128 = code.getCurr<void (*)()>();
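+    // A 128-bit value cannot be returned in integer registers on Win64, so the callback
+    // returns it through a hidden pointer into a stack slot; elsewhere it comes back in
+    // ABI_RETURN:ABI_RETURN2 and is packed into XMM1 here.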
+#ifdef _WIN32
+ Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code, [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) {
+ code.mov(code.ABI_PARAM3, code.ABI_PARAM2);
+ code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE);
+ code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]);
+ });
+ code.movups(xmm1, xword[code.ABI_RETURN]);
+ code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
+#else
+ code.sub(rsp, 8);
+ Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(xmm1, code.ABI_RETURN);
+ code.pinsrq(xmm1, code.ABI_RETURN2, 1);
+ } else {
+ code.movq(xmm1, code.ABI_RETURN);
+ code.movq(xmm2, code.ABI_RETURN2);
+ code.punpcklqdq(xmm1, xmm2);
+ }
+ code.add(rsp, 8);
+#endif
+ code.ret();
+ PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128");
+
+ code.align();
+ memory_write_128 = code.getCurr<void (*)()>();
+#ifdef _WIN32
+ code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.movaps(xword[code.ABI_PARAM3], xmm1);
+ Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code);
+ code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE);
+#else
+ code.sub(rsp, 8);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(code.ABI_PARAM3, xmm1);
+ code.pextrq(code.ABI_PARAM4, xmm1, 1);
+ } else {
+ code.movq(code.ABI_PARAM3, xmm1);
+ code.punpckhqdq(xmm1, xmm1);
+ code.movq(code.ABI_PARAM4, xmm1);
+ }
+ Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code);
+ code.add(rsp, 8);
+#endif
+ code.ret();
+ PerfMapRegister(memory_write_128, code.getCurr(), "a64_memory_write_128");
+
+ code.align();
+ memory_exclusive_write_128 = code.getCurr<void (*)()>();
+#ifdef _WIN32
+ code.sub(rsp, 8 + 32 + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 16]);
+ code.movaps(xword[code.ABI_PARAM3], xmm1);
+ code.movaps(xword[code.ABI_PARAM4], xmm2);
+ Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code);
+ code.add(rsp, 8 + 32 + ABI_SHADOW_SPACE);
+#else
+ code.sub(rsp, 8);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(code.ABI_PARAM3, xmm1);
+ code.pextrq(code.ABI_PARAM4, xmm1, 1);
+ code.movq(code.ABI_PARAM5, xmm2);
+ code.pextrq(code.ABI_PARAM6, xmm2, 1);
+ } else {
+ code.movq(code.ABI_PARAM3, xmm1);
+ code.punpckhqdq(xmm1, xmm1);
+ code.movq(code.ABI_PARAM4, xmm1);
+ code.movq(code.ABI_PARAM5, xmm2);
+ code.punpckhqdq(xmm2, xmm2);
+ code.movq(code.ABI_PARAM6, xmm2);
+ }
+ Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code);
+ code.add(rsp, 8);
+#endif
+ code.ret();
+ PerfMapRegister(memory_exclusive_write_128, code.getCurr(), "a64_memory_exclusive_write_128");
+}
+
+void A64EmitX64::GenFastmemFallbacks() {
+ const std::initializer_list<int> idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ const std::array<std::pair<size_t, ArgCallback>, 4> read_callbacks{{
+ {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)},
+ {16, Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)},
+ {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)},
+ {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)},
+ }};
+ const std::array<std::pair<size_t, ArgCallback>, 4> write_callbacks{{
+ {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)},
+ {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)},
+ {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)},
+ {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)},
+ }};
+ const std::array<std::pair<size_t, ArgCallback>, 4> exclusive_write_callbacks{{
+ {8, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)},
+ {16, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)},
+ {32, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)},
+ {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)},
+ }};
+
+ for (bool ordered : {false, true}) {
+ for (int vaddr_idx : idxes) {
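+            // Index 4 (RSP) never carries a guest address or value, and R15 is reserved
+            // for the JIT state pointer, so no fallback thunks are generated for them.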
+ if (vaddr_idx == 4 || vaddr_idx == 15) {
+ continue;
+ }
+
+ for (int value_idx : idxes) {
+ code.align();
+ read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx));
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ if (ordered) {
+ code.mfence();
+ }
+ code.call(memory_read_128);
+ if (value_idx != 1) {
+ code.movaps(Xbyak::Xmm{value_idx}, xmm1);
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx));
+ code.ret();
+ PerfMapRegister(read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128");
+
+ code.align();
+ write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStack(code);
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ if (value_idx != 1) {
+ code.movaps(xmm1, Xbyak::Xmm{value_idx});
+ }
+ code.call(memory_write_128);
+ if (ordered) {
+ code.mfence();
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStack(code);
+ code.ret();
+ PerfMapRegister(write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128");
+
+ code.align();
+ exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ if (value_idx != 1) {
+ code.movaps(xmm1, Xbyak::Xmm{value_idx});
+ }
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(xmm2, rax);
+ code.pinsrq(xmm2, rdx, 1);
+ } else {
+ code.movq(xmm2, rax);
+ code.movq(xmm0, rdx);
+ code.punpcklqdq(xmm2, xmm0);
+ }
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ code.call(memory_exclusive_write_128);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ code.ret();
+ PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_exclusive_write_fallback_128");
+
+ if (value_idx == 4 || value_idx == 15) {
+ continue;
+ }
+
+ for (const auto& [bitsize, callback] : read_callbacks) {
+ code.align();
+ read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ if (ordered) {
+ code.mfence();
+ }
+ callback.EmitCall(code);
+ if (value_idx != code.ABI_RETURN.getIdx()) {
+ code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN);
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx));
+ code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx});
+ code.ret();
+ PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize));
+ }
+
+ for (const auto& [bitsize, callback] : write_callbacks) {
+ code.align();
+ write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStack(code);
+ if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) {
+ code.xchg(code.ABI_PARAM2, code.ABI_PARAM3);
+ } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ } else {
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ }
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM3);
+ callback.EmitCall(code);
+ if (ordered) {
+ code.mfence();
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStack(code);
+ code.ret();
+ PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize));
+ }
+
+ for (const auto& [bitsize, callback] : exclusive_write_callbacks) {
+ code.align();
+ exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>();
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) {
+ code.xchg(code.ABI_PARAM2, code.ABI_PARAM3);
+ } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ } else {
+ if (value_idx != code.ABI_PARAM3.getIdx()) {
+ code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx});
+ }
+ if (vaddr_idx != code.ABI_PARAM2.getIdx()) {
+ code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx});
+ }
+ }
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM3);
+ code.mov(code.ABI_PARAM4, rax);
+ code.ZeroExtendFrom(bitsize, code.ABI_PARAM4);
+ callback.EmitCall(code);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX);
+ code.ret();
+ PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize));
+ }
+ }
+ }
+ }
+}
+
+#define Axx A64
+#include "dynarmic/backend/x64/emit_x64_memory.cpp.inc"
+#undef Axx
+
+void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryRead<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<8, &A64::UserCallbacks::MemoryWrite8>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<16, &A64::UserCallbacks::MemoryWrite16>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<32, &A64::UserCallbacks::MemoryWrite32>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) {
+ EmitMemoryWrite<64, &A64::UserCallbacks::MemoryWrite64>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) {
+    EmitMemoryWrite<128, &A64::UserCallbacks::MemoryWrite128>(ctx, inst);
+}
+
+void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) {
+ code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0));
+}
+
+void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveReadMemoryInline<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst);
+ } else {
+ EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) {
+ if (conf.fastmem_exclusive_access) {
+ EmitExclusiveWriteMemoryInline<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst);
+ } else {
+ EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst);
+ }
+}
+
+void A64EmitX64::EmitCheckMemoryAbort(A64EmitContext&, IR::Inst* inst, Xbyak::Label* end) {
+ if (!conf.check_halt_on_memory_access) {
+ return;
+ }
+
+ Xbyak::Label skip;
+
+ const A64::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}};
+
+ code.test(dword[r15 + offsetof(A64JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort));
+ if (end) {
+ code.jz(*end, code.T_NEAR);
+ } else {
+ code.jz(skip, code.T_NEAR);
+ }
+ code.mov(rax, current_location.PC());
+ code.mov(qword[r15 + offsetof(A64JitState, pc)], rax);
+ code.ForceReturnFromRunCode();
+ code.L(skip);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
new file mode 100644
index 0000000000..57f75df003
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
@@ -0,0 +1,449 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstring>
+#include <memory>
+#include <mutex>
+
+#include <boost/icl/interval_set.hpp>
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/scope_exit.hpp>
+
+#include "dynarmic/backend/x64/a64_emit_x64.h"
+#include "dynarmic/backend/x64/a64_jitstate.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/devirtualize.h"
+#include "dynarmic/backend/x64/jitstate_info.h"
+#include "dynarmic/common/atomic.h"
+#include "dynarmic/common/x64_disassemble.h"
+#include "dynarmic/frontend/A64/translate/a64_translate.h"
+#include "dynarmic/interface/A64/a64.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::A64 {
+
+using namespace Backend::X64;
+
+static RunCodeCallbacks GenRunCodeCallbacks(A64::UserCallbacks* cb, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg, const A64::UserConfig& conf) {
+ return RunCodeCallbacks{
+ std::make_unique<ArgCallback>(LookupBlock, reinterpret_cast<u64>(arg)),
+ std::make_unique<ArgCallback>(Devirtualize<&A64::UserCallbacks::AddTicks>(cb)),
+ std::make_unique<ArgCallback>(Devirtualize<&A64::UserCallbacks::GetTicksRemaining>(cb)),
+ conf.enable_cycle_counting,
+ };
+}
+
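+// Prelude run on entry to generated code: pins r14 to the guest page table and r13 to the
+// fastmem base, when those features are configured.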
+static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) {
+ return [conf](BlockOfCode& code) {
+ if (conf.page_table) {
+ code.mov(code.r14, mcl::bit_cast<u64>(conf.page_table));
+ }
+ if (conf.fastmem_pointer) {
+ code.mov(code.r13, mcl::bit_cast<u64>(conf.fastmem_pointer));
+ }
+ };
+}
+
+static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
+ return Optimization::PolyfillOptions{
+ .sha256 = !code.HasHostFeature(HostFeature::SHA),
+ .vector_multiply_widen = true,
+ };
+}
+
+struct Jit::Impl final {
+public:
+ Impl(Jit* jit, UserConfig conf)
+ : conf(conf)
+ , block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this, conf), JitStateInfo{jit_state}, conf.code_cache_size, GenRCP(conf))
+ , emitter(block_of_code, conf, jit)
+ , polyfill_options(GenPolyfillOptions(block_of_code)) {
+ ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64);
+ }
+
+ ~Impl() = default;
+
+ HaltReason Run() {
+ ASSERT(!is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason)));
+
+ is_executing = true;
+ SCOPE_EXIT {
+ this->is_executing = false;
+ };
+
+ // TODO: Check code alignment
+
+ const CodePtr current_code_ptr = [this] {
+ // RSB optimization
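+            // Peek the most recent return stack buffer entry; if its location descriptor
+            // matches the current guest state, resume directly at the cached host code
+            // pointer instead of doing a full block lookup.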
+ const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A64JitState::RSBPtrMask;
+ if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) {
+ jit_state.rsb_ptr = new_rsb_ptr;
+ return reinterpret_cast<CodePtr>(jit_state.rsb_codeptrs[new_rsb_ptr]);
+ }
+
+ return GetCurrentBlock();
+ }();
+
+ const HaltReason hr = block_of_code.RunCode(&jit_state, current_code_ptr);
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ HaltReason Step() {
+ ASSERT(!is_executing);
+ PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason)));
+
+ is_executing = true;
+ SCOPE_EXIT {
+ this->is_executing = false;
+ };
+
+ const HaltReason hr = block_of_code.StepCode(&jit_state, GetCurrentSingleStep());
+
+ PerformRequestedCacheInvalidation(hr);
+
+ return hr;
+ }
+
+ void ClearCache() {
+ std::unique_lock lock{invalidation_mutex};
+ invalidate_entire_cache = true;
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void InvalidateCacheRange(u64 start_address, size_t length) {
+ std::unique_lock lock{invalidation_mutex};
+ const auto end_address = static_cast<u64>(start_address + length - 1);
+ const auto range = boost::icl::discrete_interval<u64>::closed(start_address, end_address);
+ invalid_cache_ranges.add(range);
+ HaltExecution(HaltReason::CacheInvalidation);
+ }
+
+ void Reset() {
+ ASSERT(!is_executing);
+ jit_state = {};
+ }
+
+ void HaltExecution(HaltReason hr) {
+ Atomic::Or(&jit_state.halt_reason, static_cast<u32>(hr));
+ }
+
+ void ClearHalt(HaltReason hr) {
+ Atomic::And(&jit_state.halt_reason, ~static_cast<u32>(hr));
+ }
+
+ u64 GetSP() const {
+ return jit_state.sp;
+ }
+
+ void SetSP(u64 value) {
+ jit_state.sp = value;
+ }
+
+ u64 GetPC() const {
+ return jit_state.pc;
+ }
+
+ void SetPC(u64 value) {
+ jit_state.pc = value;
+ }
+
+ u64 GetRegister(size_t index) const {
+ if (index == 31)
+ return GetSP();
+ return jit_state.reg.at(index);
+ }
+
+ void SetRegister(size_t index, u64 value) {
+ if (index == 31)
+ return SetSP(value);
+ jit_state.reg.at(index) = value;
+ }
+
+ std::array<u64, 31> GetRegisters() const {
+ return jit_state.reg;
+ }
+
+ void SetRegisters(const std::array<u64, 31>& value) {
+ jit_state.reg = value;
+ }
+
+ Vector GetVector(size_t index) const {
+ return {jit_state.vec.at(index * 2), jit_state.vec.at(index * 2 + 1)};
+ }
+
+ void SetVector(size_t index, Vector value) {
+ jit_state.vec.at(index * 2) = value[0];
+ jit_state.vec.at(index * 2 + 1) = value[1];
+ }
+
+ std::array<Vector, 32> GetVectors() const {
+ std::array<Vector, 32> ret;
+ static_assert(sizeof(ret) == sizeof(jit_state.vec));
+ std::memcpy(ret.data(), jit_state.vec.data(), sizeof(jit_state.vec));
+ return ret;
+ }
+
+ void SetVectors(const std::array<Vector, 32>& value) {
+ static_assert(sizeof(value) == sizeof(jit_state.vec));
+ std::memcpy(jit_state.vec.data(), value.data(), sizeof(jit_state.vec));
+ }
+
+ u32 GetFpcr() const {
+ return jit_state.GetFpcr();
+ }
+
+ void SetFpcr(u32 value) {
+ jit_state.SetFpcr(value);
+ }
+
+ u32 GetFpsr() const {
+ return jit_state.GetFpsr();
+ }
+
+ void SetFpsr(u32 value) {
+ jit_state.SetFpsr(value);
+ }
+
+ u32 GetPstate() const {
+ return jit_state.GetPstate();
+ }
+
+ void SetPstate(u32 value) {
+ jit_state.SetPstate(value);
+ }
+
+ void ClearExclusiveState() {
+ jit_state.exclusive_state = 0;
+ }
+
+ bool IsExecuting() const {
+ return is_executing;
+ }
+
+ void DumpDisassembly() const {
+ const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
+ Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size);
+ }
+
+ std::vector<std::string> Disassemble() const {
+ const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
+ return Common::DisassembleX64(block_of_code.GetCodeBegin(), size);
+ }
+
+private:
+ static CodePtr GetCurrentBlockThunk(void* thisptr) {
+ Jit::Impl* this_ = static_cast<Jit::Impl*>(thisptr);
+ return this_->GetCurrentBlock();
+ }
+
+ IR::LocationDescriptor GetCurrentLocation() const {
+ return IR::LocationDescriptor{jit_state.GetUniqueHash()};
+ }
+
+ CodePtr GetCurrentBlock() {
+ return GetBlock(GetCurrentLocation());
+ }
+
+ CodePtr GetCurrentSingleStep() {
+ return GetBlock(A64::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true));
+ }
+
+ CodePtr GetBlock(IR::LocationDescriptor current_location) {
+ if (auto block = emitter.GetBasicBlock(current_location))
+ return block->entrypoint;
+
+ constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024;
+ if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) {
+ // Immediately evacuate cache
+ invalidate_entire_cache = true;
+ PerformRequestedCacheInvalidation(HaltReason::CacheInvalidation);
+ }
+ block_of_code.EnsureMemoryCommitted(MINIMUM_REMAINING_CODESIZE);
+
+ // JIT Compile
+ const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); };
+ IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code,
+ {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct});
+ Optimization::PolyfillPass(ir_block, polyfill_options);
+ Optimization::A64CallbackConfigPass(ir_block, conf);
+ Optimization::NamingPass(ir_block);
+ if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) {
+ Optimization::A64GetSetElimination(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::ConstProp)) {
+ Optimization::ConstantPropagation(ir_block);
+ Optimization::DeadCodeElimination(ir_block);
+ }
+ if (conf.HasOptimization(OptimizationFlag::MiscIROpt)) {
+ Optimization::A64MergeInterpretBlocksPass(ir_block, conf.callbacks);
+ }
+ Optimization::VerificationPass(ir_block);
+ return emitter.Emit(ir_block).entrypoint;
+ }
+
+ void PerformRequestedCacheInvalidation(HaltReason hr) {
+ if (Has(hr, HaltReason::CacheInvalidation)) {
+ std::unique_lock lock{invalidation_mutex};
+
+ ClearHalt(HaltReason::CacheInvalidation);
+
+ if (!invalidate_entire_cache && invalid_cache_ranges.empty()) {
+ return;
+ }
+
+ jit_state.ResetRSB();
+ if (invalidate_entire_cache) {
+ block_of_code.ClearCache();
+ emitter.ClearCache();
+ } else {
+ emitter.InvalidateCacheRanges(invalid_cache_ranges);
+ }
+ invalid_cache_ranges.clear();
+ invalidate_entire_cache = false;
+ }
+ }
+
+ bool is_executing = false;
+
+ const UserConfig conf;
+ A64JitState jit_state;
+ BlockOfCode block_of_code;
+ A64EmitX64 emitter;
+ Optimization::PolyfillOptions polyfill_options;
+
+ bool invalidate_entire_cache = false;
+ boost::icl::interval_set<u64> invalid_cache_ranges;
+ std::mutex invalidation_mutex;
+};
+
+Jit::Jit(UserConfig conf)
+ : impl(std::make_unique<Jit::Impl>(this, conf)) {}
+
+Jit::~Jit() = default;
+
+HaltReason Jit::Run() {
+ return impl->Run();
+}
+
+HaltReason Jit::Step() {
+ return impl->Step();
+}
+
+void Jit::ClearCache() {
+ impl->ClearCache();
+}
+
+void Jit::InvalidateCacheRange(u64 start_address, size_t length) {
+ impl->InvalidateCacheRange(start_address, length);
+}
+
+void Jit::Reset() {
+ impl->Reset();
+}
+
+void Jit::HaltExecution(HaltReason hr) {
+ impl->HaltExecution(hr);
+}
+
+void Jit::ClearHalt(HaltReason hr) {
+ impl->ClearHalt(hr);
+}
+
+u64 Jit::GetSP() const {
+ return impl->GetSP();
+}
+
+void Jit::SetSP(u64 value) {
+ impl->SetSP(value);
+}
+
+u64 Jit::GetPC() const {
+ return impl->GetPC();
+}
+
+void Jit::SetPC(u64 value) {
+ impl->SetPC(value);
+}
+
+u64 Jit::GetRegister(size_t index) const {
+ return impl->GetRegister(index);
+}
+
+void Jit::SetRegister(size_t index, u64 value) {
+ impl->SetRegister(index, value);
+}
+
+std::array<u64, 31> Jit::GetRegisters() const {
+ return impl->GetRegisters();
+}
+
+void Jit::SetRegisters(const std::array<u64, 31>& value) {
+ impl->SetRegisters(value);
+}
+
+Vector Jit::GetVector(size_t index) const {
+ return impl->GetVector(index);
+}
+
+void Jit::SetVector(size_t index, Vector value) {
+ impl->SetVector(index, value);
+}
+
+std::array<Vector, 32> Jit::GetVectors() const {
+ return impl->GetVectors();
+}
+
+void Jit::SetVectors(const std::array<Vector, 32>& value) {
+ impl->SetVectors(value);
+}
+
+u32 Jit::GetFpcr() const {
+ return impl->GetFpcr();
+}
+
+void Jit::SetFpcr(u32 value) {
+ impl->SetFpcr(value);
+}
+
+u32 Jit::GetFpsr() const {
+ return impl->GetFpsr();
+}
+
+void Jit::SetFpsr(u32 value) {
+ impl->SetFpsr(value);
+}
+
+u32 Jit::GetPstate() const {
+ return impl->GetPstate();
+}
+
+void Jit::SetPstate(u32 value) {
+ impl->SetPstate(value);
+}
+
+void Jit::ClearExclusiveState() {
+ impl->ClearExclusiveState();
+}
+
+bool Jit::IsExecuting() const {
+ return impl->IsExecuting();
+}
+
+void Jit::DumpDisassembly() const {
+ return impl->DumpDisassembly();
+}
+
+std::vector<std::string> Jit::Disassemble() const {
+ return impl->Disassemble();
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp
new file mode 100644
index 0000000000..9f983f3955
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp
@@ -0,0 +1,116 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/a64_jitstate.h"
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+
+namespace Dynarmic::Backend::X64 {
+
+/**
+ * Comparing MXCSR and FPCR
+ * ========================
+ *
+ * SSE MXCSR exception masks
+ * -------------------------
+ * PM bit 12 Precision Mask
+ * UM bit 11 Underflow Mask
+ * OM bit 10 Overflow Mask
+ * ZM bit 9 Divide By Zero Mask
+ * DM bit 8 Denormal Mask
+ * IM bit 7 Invalid Operation Mask
+ *
+ * A64 FPCR exception trap enables
+ * -------------------------------
+ * IDE bit 15 Input Denormal exception trap enable
+ * IXE bit 12 Inexact exception trap enable
+ * UFE bit 11 Underflow exception trap enable
+ * OFE bit 10 Overflow exception trap enable
+ * DZE bit 9 Division by Zero exception trap enable
+ * IOE bit 8 Invalid Operation exception trap enable
+ *
+ * SSE MXCSR mode bits
+ * -------------------
+ * FZ bit 15 Flush To Zero
+ * DAZ bit 6 Denormals Are Zero
+ * RN bits 13-14 Round to {0 = Nearest, 1 = Negative, 2 = Positive, 3 = Zero}
+ *
+ * A64 FPCR mode bits
+ * ------------------
+ * AHP bit 26 Alternative half-precision
+ * DN bit 25 Default NaN
+ * FZ bit 24 Flush to Zero
+ * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero}
+ * FZ16 bit 19 Flush to Zero for half-precision
+ */
+
+constexpr u32 FPCR_MASK = 0x07C89F00;
+
+u32 A64JitState::GetFpcr() const {
+ return fpcr;
+}
+
+void A64JitState::SetFpcr(u32 value) {
+ fpcr = value & FPCR_MASK;
+
+ asimd_MXCSR &= 0x0000003D;
+ guest_MXCSR &= 0x0000003D;
+ asimd_MXCSR |= 0x00001f80;
+ guest_MXCSR |= 0x00001f80; // Mask all exceptions
+
+ // RMode
+ const std::array<u32, 4> MXCSR_RMode{0x0, 0x4000, 0x2000, 0x6000};
+ guest_MXCSR |= MXCSR_RMode[(value >> 22) & 0x3];
+
+ if (mcl::bit::get_bit<24>(value)) {
+ guest_MXCSR |= (1 << 15); // SSE Flush to Zero
+ guest_MXCSR |= (1 << 6); // SSE Denormals are Zero
+ }
+}
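+// Illustrative example: SetFpcr(0x03400000) (DN, FZ, RMode = towards plus infinity) stores
+// fpcr = 0x03400000 and, assuming the sticky exception flags were clear, leaves
+// guest_MXCSR = 0x0000DFC0: all exceptions masked, RC = round up, FZ and DAZ set.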
+
+/**
+ * Comparing MXCSR and FPSR
+ * ========================
+ *
+ * SSE MXCSR exception flags
+ * -------------------------
+ * PE bit 5 Precision Flag
+ * UE bit 4 Underflow Flag
+ * OE bit 3 Overflow Flag
+ * ZE bit 2 Divide By Zero Flag
+ * DE bit 1 Denormal Flag // Appears to only be set when MXCSR.DAZ = 0
+ * IE bit 0 Invalid Operation Flag
+ *
+ * A64 FPSR cumulative exception bits
+ * ----------------------------------
+ * QC bit 27 Cumulative saturation bit
+ * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPCR.FZ = 1
+ * IXC bit 4 Inexact cumulative exception bit
+ * UFC bit 3 Underflow cumulative exception bit
+ * OFC bit 2 Overflow cumulative exception bit
+ * DZC bit 1 Division by Zero cumulative exception bit
+ * IOC bit 0 Invalid Operation cumulative exception bit
+ */
+
+u32 A64JitState::GetFpsr() const {
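+    // Scalar and ASIMD code run under separate MXCSR images, so the sticky exception
+    // flags accumulated in both are merged before translation to FPSR bits.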
+ const u32 mxcsr = guest_MXCSR | asimd_MXCSR;
+ u32 fpsr = 0;
+ fpsr |= (mxcsr & 0b0000000000001); // IOC = IE
+ fpsr |= (mxcsr & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE
+ fpsr |= fpsr_exc;
+ fpsr |= (fpsr_qc == 0 ? 0 : 1) << 27;
+ return fpsr;
+}
+
+void A64JitState::SetFpsr(u32 value) {
+ guest_MXCSR &= ~0x0000003D;
+ asimd_MXCSR &= ~0x0000003D;
+ fpsr_qc = (value >> 27) & 1;
+ fpsr_exc = value & 0x9F;
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h
new file mode 100644
index 0000000000..0929e81ec5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h
@@ -0,0 +1,84 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/nzcv_util.h"
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+
+namespace Dynarmic::Backend::X64 {
+
+class BlockOfCode;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+struct A64JitState {
+ using ProgramCounterType = u64;
+
+ A64JitState() { ResetRSB(); }
+
+ std::array<u64, 31> reg{};
+ u64 sp = 0;
+ u64 pc = 0;
+
+ u32 cpsr_nzcv = 0;
+
+ u32 GetPstate() const {
+ return NZCV::FromX64(cpsr_nzcv);
+ }
+ void SetPstate(u32 new_pstate) {
+ cpsr_nzcv = NZCV::ToX64(new_pstate);
+ }
+
+ alignas(16) std::array<u64, 64> vec{}; // Extension registers.
+
+ // For internal use (See: BlockOfCode::RunCode)
+ u32 guest_MXCSR = 0x00001f80;
+ u32 asimd_MXCSR = 0x00009fc0;
+ volatile u32 halt_reason = 0;
+
+ // Exclusive state
+ static constexpr u64 RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFF0ull;
+ u8 exclusive_state = 0;
+
+ static constexpr size_t RSBSize = 8; // MUST be a power of 2.
+ static constexpr size_t RSBPtrMask = RSBSize - 1;
+ u32 rsb_ptr = 0;
+ std::array<u64, RSBSize> rsb_location_descriptors;
+ std::array<u64, RSBSize> rsb_codeptrs;
+ void ResetRSB() {
+ rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull);
+ rsb_codeptrs.fill(0);
+ }
+
+ u32 fpsr_exc = 0;
+ u32 fpsr_qc = 0;
+ u32 fpcr = 0;
+ u32 GetFpcr() const;
+ u32 GetFpsr() const;
+ void SetFpcr(u32 value);
+ void SetFpsr(u32 value);
+
+ u64 GetUniqueHash() const noexcept {
+ const u64 fpcr_u64 = static_cast<u64>(fpcr & A64::LocationDescriptor::fpcr_mask) << A64::LocationDescriptor::fpcr_shift;
+ const u64 pc_u64 = pc & A64::LocationDescriptor::pc_mask;
+ return pc_u64 | fpcr_u64;
+ }
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+using CodePtr = const void*;
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp b/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp
new file mode 100644
index 0000000000..d6a83b65b4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp
@@ -0,0 +1,135 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/abi.h"
+
+#include <algorithm>
+#include <vector>
+
+#include <mcl/iterator/reverse.hpp>
+#include <mcl/stdint.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+
+namespace Dynarmic::Backend::X64 {
+
+constexpr size_t XMM_SIZE = 16;
+
+struct FrameInfo {
+ size_t stack_subtraction;
+ size_t xmm_offset;
+ size_t frame_offset;
+};
+
+static FrameInfo CalculateFrameInfo(size_t num_gprs, size_t num_xmms, size_t frame_size) {
+    // We are initially 8-byte aligned because the call pushed the return address onto an aligned stack.
+ const size_t rsp_alignment = (num_gprs % 2 == 0) ? 8 : 0;
+ const size_t total_xmm_size = num_xmms * XMM_SIZE;
+
+ if (frame_size & 0xF) {
+ frame_size += 0x10 - (frame_size & 0xF);
+ }
+
+ return {
+ rsp_alignment + total_xmm_size + frame_size + ABI_SHADOW_SPACE,
+ frame_size + ABI_SHADOW_SPACE,
+ ABI_SHADOW_SPACE,
+ };
+}
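+
+// Illustrative example (SysV, where ABI_SHADOW_SPACE is 0): with 3 GPRs, 2 XMMs and
+// frame_size = 8, rsp_alignment is 0 (odd GPR count), the XMM area occupies 32 bytes
+// and frame_size rounds up to 16, giving stack_subtraction = 48 and xmm_offset = 16.
+// After the pushes and the subtraction, rsp is 16-byte aligned again for call sites.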
+
+template<typename RegisterArrayT>
+void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const RegisterArrayT& regs) {
+ using namespace Xbyak::util;
+
+ const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR);
+ const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM);
+
+ FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size);
+
+ for (HostLoc gpr : regs) {
+ if (HostLocIsGPR(gpr)) {
+ code.push(HostLocToReg64(gpr));
+ }
+ }
+
+ if (frame_info.stack_subtraction != 0) {
+ code.sub(rsp, u32(frame_info.stack_subtraction));
+ }
+
+ size_t xmm_offset = frame_info.xmm_offset;
+ for (HostLoc xmm : regs) {
+ if (HostLocIsXMM(xmm)) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
+ } else {
+ code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm));
+ }
+ xmm_offset += XMM_SIZE;
+ }
+ }
+}
+
+template<typename RegisterArrayT>
+void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const RegisterArrayT& regs) {
+ using namespace Xbyak::util;
+
+ const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR);
+ const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM);
+
+ FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size);
+
+ size_t xmm_offset = frame_info.xmm_offset;
+ for (HostLoc xmm : regs) {
+ if (HostLocIsXMM(xmm)) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);
+ } else {
+ code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]);
+ }
+ xmm_offset += XMM_SIZE;
+ }
+ }
+
+ if (frame_info.stack_subtraction != 0) {
+ code.add(rsp, u32(frame_info.stack_subtraction));
+ }
+
+ for (HostLoc gpr : mcl::iterator::reverse(regs)) {
+ if (HostLocIsGPR(gpr)) {
+ code.pop(HostLocToReg64(gpr));
+ }
+ }
+}
+
+void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) {
+ ABI_PushRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLEE_SAVE);
+}
+
+void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) {
+ ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLEE_SAVE);
+}
+
+void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) {
+ ABI_PushRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE);
+}
+
+void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) {
+ ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE);
+}
+
+void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) {
+ std::vector<HostLoc> regs;
+ std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
+ ABI_PushRegistersAndAdjustStack(code, 0, regs);
+}
+
+void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) {
+ std::vector<HostLoc> regs;
+ std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
+ ABI_PopRegistersAndAdjustStack(code, 0, regs);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/abi.h b/externals/dynarmic/src/dynarmic/backend/x64/abi.h
new file mode 100644
index 0000000000..4bddf51bad
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/abi.h
@@ -0,0 +1,132 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/hostloc.h"
+
+namespace Dynarmic::Backend::X64 {
+
+class BlockOfCode;
+
+#ifdef _WIN32
+
+constexpr HostLoc ABI_RETURN = HostLoc::RAX;
+
+constexpr size_t ABI_PARAM_COUNT = 4;
+
+constexpr HostLoc ABI_PARAM1 = HostLoc::RCX;
+constexpr HostLoc ABI_PARAM2 = HostLoc::RDX;
+constexpr HostLoc ABI_PARAM3 = HostLoc::R8;
+constexpr HostLoc ABI_PARAM4 = HostLoc::R9;
+
+constexpr std::array<HostLoc, 13> ABI_ALL_CALLER_SAVE = {
+ HostLoc::RAX,
+ HostLoc::RCX,
+ HostLoc::RDX,
+ HostLoc::R8,
+ HostLoc::R9,
+ HostLoc::R10,
+ HostLoc::R11,
+ HostLoc::XMM0,
+ HostLoc::XMM1,
+ HostLoc::XMM2,
+ HostLoc::XMM3,
+ HostLoc::XMM4,
+ HostLoc::XMM5,
+};
+
+constexpr std::array<HostLoc, 18> ABI_ALL_CALLEE_SAVE = {
+ HostLoc::RBX,
+ HostLoc::RSI,
+ HostLoc::RDI,
+ HostLoc::RBP,
+ HostLoc::R12,
+ HostLoc::R13,
+ HostLoc::R14,
+ HostLoc::R15,
+ HostLoc::XMM6,
+ HostLoc::XMM7,
+ HostLoc::XMM8,
+ HostLoc::XMM9,
+ HostLoc::XMM10,
+ HostLoc::XMM11,
+ HostLoc::XMM12,
+ HostLoc::XMM13,
+ HostLoc::XMM14,
+ HostLoc::XMM15,
+};
+
+constexpr size_t ABI_SHADOW_SPACE = 32; // bytes
+
+#else
+
+constexpr HostLoc ABI_RETURN = HostLoc::RAX;
+constexpr HostLoc ABI_RETURN2 = HostLoc::RDX;
+
+constexpr size_t ABI_PARAM_COUNT = 6;
+
+constexpr HostLoc ABI_PARAM1 = HostLoc::RDI;
+constexpr HostLoc ABI_PARAM2 = HostLoc::RSI;
+constexpr HostLoc ABI_PARAM3 = HostLoc::RDX;
+constexpr HostLoc ABI_PARAM4 = HostLoc::RCX;
+constexpr HostLoc ABI_PARAM5 = HostLoc::R8;
+constexpr HostLoc ABI_PARAM6 = HostLoc::R9;
+
+constexpr std::array<HostLoc, 25> ABI_ALL_CALLER_SAVE = {
+ HostLoc::RAX,
+ HostLoc::RCX,
+ HostLoc::RDX,
+ HostLoc::RDI,
+ HostLoc::RSI,
+ HostLoc::R8,
+ HostLoc::R9,
+ HostLoc::R10,
+ HostLoc::R11,
+ HostLoc::XMM0,
+ HostLoc::XMM1,
+ HostLoc::XMM2,
+ HostLoc::XMM3,
+ HostLoc::XMM4,
+ HostLoc::XMM5,
+ HostLoc::XMM6,
+ HostLoc::XMM7,
+ HostLoc::XMM8,
+ HostLoc::XMM9,
+ HostLoc::XMM10,
+ HostLoc::XMM11,
+ HostLoc::XMM12,
+ HostLoc::XMM13,
+ HostLoc::XMM14,
+ HostLoc::XMM15,
+};
+
+constexpr std::array<HostLoc, 6> ABI_ALL_CALLEE_SAVE = {
+ HostLoc::RBX,
+ HostLoc::RBP,
+ HostLoc::R12,
+ HostLoc::R13,
+ HostLoc::R14,
+ HostLoc::R15,
+};
+
+constexpr size_t ABI_SHADOW_SPACE = 0; // bytes
+
+#endif
+
+static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 31, "Invalid total number of registers");
+
+void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);
+void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);
+void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);
+void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);
+
+void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception);
+void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception);
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
new file mode 100644
index 0000000000..22d9868fc5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
@@ -0,0 +1,540 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/block_of_code.h"
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#else
+# include <sys/mman.h>
+#endif
+
+#ifdef __APPLE__
+# include <errno.h>
+# include <fmt/format.h>
+# include <sys/sysctl.h>
+#endif
+
+#include <array>
+#include <cstring>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/a32_jitstate.h"
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/hostloc.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+
+namespace Dynarmic::Backend::X64 {
+
+#ifdef _WIN32
+const Xbyak::Reg64 BlockOfCode::ABI_RETURN = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM1 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM1);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM2);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM3 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM3);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM4 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM4);
+const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4};
+#else
+const Xbyak::Reg64 BlockOfCode::ABI_RETURN = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN);
+const Xbyak::Reg64 BlockOfCode::ABI_RETURN2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN2);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM1 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM1);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM2);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM3 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM3);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM4 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM4);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM5 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM5);
+const Xbyak::Reg64 BlockOfCode::ABI_PARAM6 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM6);
+const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4, BlockOfCode::ABI_PARAM5, BlockOfCode::ABI_PARAM6};
+#endif
+
+namespace {
+
+constexpr size_t CONSTANT_POOL_SIZE = 2 * 1024 * 1024;
+constexpr size_t PRELUDE_COMMIT_SIZE = 16 * 1024 * 1024;
+
+class CustomXbyakAllocator : public Xbyak::Allocator {
+public:
+#ifdef _WIN32
+ uint8_t* alloc(size_t size) override {
+ void* p = VirtualAlloc(nullptr, size, MEM_RESERVE, PAGE_READWRITE);
+ if (p == nullptr) {
+ throw Xbyak::Error(Xbyak::ERR_CANT_ALLOC);
+ }
+ return static_cast<uint8_t*>(p);
+ }
+
+ void free(uint8_t* p) override {
+ VirtualFree(static_cast<void*>(p), 0, MEM_RELEASE);
+ }
+
+ bool useProtect() const override { return false; }
+#else
+ static constexpr size_t DYNARMIC_PAGE_SIZE = 4096;
+
+ // Can't subclass Xbyak::MmapAllocator because it is not a pure interface
+    // and doesn't expose its constructor
+ uint8_t* alloc(size_t size) override {
+ // Waste a page to store the size
+ size += DYNARMIC_PAGE_SIZE;
+
+# if defined(MAP_ANONYMOUS)
+ int mode = MAP_PRIVATE | MAP_ANONYMOUS;
+# elif defined(MAP_ANON)
+ int mode = MAP_PRIVATE | MAP_ANON;
+# else
+# error "not supported"
+# endif
+# ifdef MAP_JIT
+ mode |= MAP_JIT;
+# endif
+
+ void* p = mmap(nullptr, size, PROT_READ | PROT_WRITE, mode, -1, 0);
+ if (p == MAP_FAILED) {
+ throw Xbyak::Error(Xbyak::ERR_CANT_ALLOC);
+ }
+ std::memcpy(p, &size, sizeof(size_t));
+ return static_cast<uint8_t*>(p) + DYNARMIC_PAGE_SIZE;
+ }
+
+ void free(uint8_t* p) override {
+ size_t size;
+ std::memcpy(&size, p - DYNARMIC_PAGE_SIZE, sizeof(size_t));
+ munmap(p - DYNARMIC_PAGE_SIZE, size);
+ }
+
+# ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+ bool useProtect() const override { return false; }
+# endif
+#endif
+};
+
+// This is threadsafe as Xbyak::Allocator does not contain any state; it is a pure interface.
+CustomXbyakAllocator s_allocator;
+
+#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+void ProtectMemory(const void* base, size_t size, bool is_executable) {
+# ifdef _WIN32
+ DWORD oldProtect = 0;
+ VirtualProtect(const_cast<void*>(base), size, is_executable ? PAGE_EXECUTE_READ : PAGE_READWRITE, &oldProtect);
+# else
+ static const size_t pageSize = sysconf(_SC_PAGESIZE);
+ const size_t iaddr = reinterpret_cast<size_t>(base);
+ const size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
+ const int mode = is_executable ? (PROT_READ | PROT_EXEC) : (PROT_READ | PROT_WRITE);
+ mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode);
+# endif
+}
+#endif
+
+HostFeature GetHostFeatures() {
+ HostFeature features = {};
+
+#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION
+ using Cpu = Xbyak::util::Cpu;
+ Xbyak::util::Cpu cpu_info;
+
+ if (cpu_info.has(Cpu::tSSSE3))
+ features |= HostFeature::SSSE3;
+ if (cpu_info.has(Cpu::tSSE41))
+ features |= HostFeature::SSE41;
+ if (cpu_info.has(Cpu::tSSE42))
+ features |= HostFeature::SSE42;
+ if (cpu_info.has(Cpu::tAVX))
+ features |= HostFeature::AVX;
+ if (cpu_info.has(Cpu::tAVX2))
+ features |= HostFeature::AVX2;
+ if (cpu_info.has(Cpu::tAVX512F))
+ features |= HostFeature::AVX512F;
+ if (cpu_info.has(Cpu::tAVX512CD))
+ features |= HostFeature::AVX512CD;
+ if (cpu_info.has(Cpu::tAVX512VL))
+ features |= HostFeature::AVX512VL;
+ if (cpu_info.has(Cpu::tAVX512BW))
+ features |= HostFeature::AVX512BW;
+ if (cpu_info.has(Cpu::tAVX512DQ))
+ features |= HostFeature::AVX512DQ;
+ if (cpu_info.has(Cpu::tAVX512_BITALG))
+ features |= HostFeature::AVX512BITALG;
+ if (cpu_info.has(Cpu::tAVX512VBMI))
+ features |= HostFeature::AVX512VBMI;
+ if (cpu_info.has(Cpu::tPCLMULQDQ))
+ features |= HostFeature::PCLMULQDQ;
+ if (cpu_info.has(Cpu::tF16C))
+ features |= HostFeature::F16C;
+ if (cpu_info.has(Cpu::tFMA))
+ features |= HostFeature::FMA;
+ if (cpu_info.has(Cpu::tAESNI))
+ features |= HostFeature::AES;
+ if (cpu_info.has(Cpu::tSHA))
+ features |= HostFeature::SHA;
+ if (cpu_info.has(Cpu::tPOPCNT))
+ features |= HostFeature::POPCNT;
+ if (cpu_info.has(Cpu::tBMI1))
+ features |= HostFeature::BMI1;
+ if (cpu_info.has(Cpu::tBMI2))
+ features |= HostFeature::BMI2;
+ if (cpu_info.has(Cpu::tLZCNT))
+ features |= HostFeature::LZCNT;
+ if (cpu_info.has(Cpu::tGFNI))
+ features |= HostFeature::GFNI;
+
+ if (cpu_info.has(Cpu::tBMI2)) {
+ // BMI2 instructions such as pdep and pext have been very slow up until Zen 3.
+ // Check for Zen 3 or newer by its family (0x19).
+ // See also: https://en.wikichip.org/wiki/amd/cpuid
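+        // For example, Zen 3 reports base family 0xF and extended family 0xA in
+        // CPUID leaf 1, giving family = 0xF + 0xA = 0x19.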
+ if (cpu_info.has(Cpu::tAMD)) {
+ std::array<u32, 4> data{};
+ cpu_info.getCpuid(1, data.data());
+ const u32 family_base = mcl::bit::get_bits<8, 11>(data[0]);
+ const u32 family_extended = mcl::bit::get_bits<20, 27>(data[0]);
+ const u32 family = family_base + family_extended;
+ if (family >= 0x19)
+ features |= HostFeature::FastBMI2;
+ } else {
+ features |= HostFeature::FastBMI2;
+ }
+ }
+#endif
+
+ return features;
+}
+
+#ifdef __APPLE__
+bool IsUnderRosetta() {
+ int result = 0;
+ size_t result_size = sizeof(result);
+ if (sysctlbyname("sysctl.proc_translated", &result, &result_size, nullptr, 0) == -1) {
+ if (errno != ENOENT)
+            fmt::print("IsUnderRosetta: Failed to detect Rosetta state, assuming not under Rosetta\n");
+ return false;
+ }
+ return result != 0;
+}
+#endif
+
+} // anonymous namespace
+
+BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, std::function<void(BlockOfCode&)> rcp)
+ : Xbyak::CodeGenerator(total_code_size, nullptr, &s_allocator)
+ , cb(std::move(cb))
+ , jsi(jsi)
+ , constant_pool(*this, CONSTANT_POOL_SIZE)
+ , host_features(GetHostFeatures()) {
+ EnableWriting();
+ EnsureMemoryCommitted(PRELUDE_COMMIT_SIZE);
+ GenRunCode(rcp);
+}
+
+void BlockOfCode::PreludeComplete() {
+ prelude_complete = true;
+ code_begin = getCurr();
+ ClearCache();
+ DisableWriting();
+}
+
+void BlockOfCode::EnableWriting() {
+#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+# ifdef _WIN32
+ ProtectMemory(getCode(), committed_size, false);
+# else
+ ProtectMemory(getCode(), maxSize_, false);
+# endif
+#endif
+}
+
+void BlockOfCode::DisableWriting() {
+#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+# ifdef _WIN32
+ ProtectMemory(getCode(), committed_size, true);
+# else
+ ProtectMemory(getCode(), maxSize_, true);
+# endif
+#endif
+}
+
+void BlockOfCode::ClearCache() {
+ ASSERT(prelude_complete);
+ SetCodePtr(code_begin);
+}
+
+size_t BlockOfCode::SpaceRemaining() const {
+ ASSERT(prelude_complete);
+ const u8* current_ptr = getCurr<const u8*>();
+ if (current_ptr >= &top_[maxSize_])
+ return 0;
+ return &top_[maxSize_] - current_ptr;
+}
+
+void BlockOfCode::EnsureMemoryCommitted([[maybe_unused]] size_t codesize) {
+#ifdef _WIN32
+ if (committed_size < size_ + codesize) {
+ committed_size = std::min<size_t>(maxSize_, committed_size + codesize);
+# ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT
+ VirtualAlloc(top_, committed_size, MEM_COMMIT, PAGE_READWRITE);
+# else
+ VirtualAlloc(top_, committed_size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+# endif
+ }
+#endif
+}
+
+HaltReason BlockOfCode::RunCode(void* jit_state, CodePtr code_ptr) const {
+ return run_code(jit_state, code_ptr);
+}
+
+HaltReason BlockOfCode::StepCode(void* jit_state, CodePtr code_ptr) const {
+ return step_code(jit_state, code_ptr);
+}
+
+void BlockOfCode::ReturnFromRunCode(bool mxcsr_already_exited) {
+ size_t index = 0;
+ if (mxcsr_already_exited)
+ index |= MXCSR_ALREADY_EXITED;
+ jmp(return_from_run_code[index]);
+}
+
+void BlockOfCode::ForceReturnFromRunCode(bool mxcsr_already_exited) {
+ size_t index = FORCE_RETURN;
+ if (mxcsr_already_exited)
+ index |= MXCSR_ALREADY_EXITED;
+ jmp(return_from_run_code[index]);
+}
+
+void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
+ Xbyak::Label return_to_caller, return_to_caller_mxcsr_already_exited;
+
+ align();
+ run_code = getCurr<RunCodeFuncType>();
+
+ // This serves two purposes:
+    // 1. It saves all the registers that we, as the callee, are required to preserve.
+ // 2. It aligns the stack so that the code the JIT emits can assume
+ // that the stack is appropriately aligned for CALLs.
+ ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
+
+ mov(r15, ABI_PARAM1);
+ mov(rbx, ABI_PARAM2); // save temporarily in non-volatile register
+
+ if (cb.enable_cycle_counting) {
+ cb.GetTicksRemaining->EmitCall(*this);
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], ABI_RETURN);
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], ABI_RETURN);
+ }
+
+ rcp(*this);
+
+ cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
+ jne(return_to_caller_mxcsr_already_exited, T_NEAR);
+
+ SwitchMxcsrOnEntry();
+ jmp(rbx);
+
+ align();
+ step_code = getCurr<RunCodeFuncType>();
+
+ ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
+
+ mov(r15, ABI_PARAM1);
+
+ if (cb.enable_cycle_counting) {
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], 1);
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 1);
+ }
+
+ rcp(*this);
+
+ cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
+ jne(return_to_caller_mxcsr_already_exited, T_NEAR);
+ lock();
+ or_(dword[r15 + jsi.offsetof_halt_reason], static_cast<u32>(HaltReason::Step));
+
+ SwitchMxcsrOnEntry();
+ jmp(ABI_PARAM2);
+
+ // Dispatcher loop
+
+ align();
+ return_from_run_code[0] = getCurr<const void*>();
+
+ cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
+ jne(return_to_caller);
+ if (cb.enable_cycle_counting) {
+ cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
+ jng(return_to_caller);
+ }
+ cb.LookupBlock->EmitCall(*this);
+ jmp(ABI_RETURN);
+
+ align();
+ return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>();
+
+ cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
+ jne(return_to_caller_mxcsr_already_exited);
+ if (cb.enable_cycle_counting) {
+ cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
+ jng(return_to_caller_mxcsr_already_exited);
+ }
+ SwitchMxcsrOnEntry();
+ cb.LookupBlock->EmitCall(*this);
+ jmp(ABI_RETURN);
+
+ align();
+ return_from_run_code[FORCE_RETURN] = getCurr<const void*>();
+ L(return_to_caller);
+
+ SwitchMxcsrOnExit();
+ // fallthrough
+
+ return_from_run_code[MXCSR_ALREADY_EXITED | FORCE_RETURN] = getCurr<const void*>();
+ L(return_to_caller_mxcsr_already_exited);
+
+ if (cb.enable_cycle_counting) {
+ cb.AddTicks->EmitCall(*this, [this](RegList param) {
+ mov(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
+ sub(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
+ });
+ }
+
+ xor_(eax, eax);
+ lock();
+ xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
+
+ ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
+ ret();
+
+ PerfMapRegister(run_code, getCurr(), "dynarmic_dispatcher");
+}
+
+void BlockOfCode::SwitchMxcsrOnEntry() {
+ stmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]);
+ ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
+}
+
+void BlockOfCode::SwitchMxcsrOnExit() {
+ stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
+ ldmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]);
+}
+
+void BlockOfCode::EnterStandardASIMD() {
+ stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
+ ldmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]);
+}
+
+void BlockOfCode::LeaveStandardASIMD() {
+ stmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]);
+ ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]);
+}
+
+void BlockOfCode::UpdateTicks() {
+ if (!cb.enable_cycle_counting) {
+ return;
+ }
+
+ cb.AddTicks->EmitCall(*this, [this](RegList param) {
+ mov(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]);
+ sub(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]);
+ });
+
+ cb.GetTicksRemaining->EmitCall(*this);
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], ABI_RETURN);
+ mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], ABI_RETURN);
+}
+
+void BlockOfCode::LookupBlock() {
+ cb.LookupBlock->EmitCall(*this);
+}
+
+void BlockOfCode::LoadRequiredFlagsForCondFromRax(IR::Cond cond) {
+#ifdef __APPLE__
+ static const bool is_rosetta = IsUnderRosetta();
+#endif
+
+ // sahf restores SF, ZF, CF
+    // cmp al, 0x81 restores OF (al holds V; the signed subtraction overflows exactly when V is set)
+
+ switch (cond) {
+ case IR::Cond::EQ: // z
+ case IR::Cond::NE: // !z
+ case IR::Cond::CS: // c
+ case IR::Cond::CC: // !c
+ case IR::Cond::MI: // n
+ case IR::Cond::PL: // !n
+ sahf();
+ break;
+ case IR::Cond::VS: // v
+ case IR::Cond::VC: // !v
+ cmp(al, 0x81);
+ break;
+ case IR::Cond::HI: // c & !z
+ case IR::Cond::LS: // !c | z
+ sahf();
+ cmc();
+ break;
+ case IR::Cond::GE: // n == v
+ case IR::Cond::LT: // n != v
+ case IR::Cond::GT: // !z & (n == v)
+ case IR::Cond::LE: // z | (n != v)
+#ifdef __APPLE__
+ if (is_rosetta) {
+ shl(al, 3);
+ xchg(al, ah);
+ push(rax);
+ popf();
+ break;
+ }
+#endif
+ cmp(al, 0x81);
+ sahf();
+ break;
+ case IR::Cond::AL:
+ case IR::Cond::NV:
+ break;
+ default:
+ ASSERT_MSG(false, "Unknown cond {}", static_cast<size_t>(cond));
+ break;
+ }
+}
+
+Xbyak::Address BlockOfCode::Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) {
+ return constant_pool.GetConstant(frame, lower, upper);
+}
+
+CodePtr BlockOfCode::GetCodeBegin() const {
+ return code_begin;
+}
+
+size_t BlockOfCode::GetTotalCodeSize() const {
+ return maxSize_;
+}
+
+void* BlockOfCode::AllocateFromCodeSpace(size_t alloc_size) {
+ if (size_ + alloc_size >= maxSize_) {
+ throw Xbyak::Error(Xbyak::ERR_CODE_IS_TOO_BIG);
+ }
+
+ EnsureMemoryCommitted(alloc_size);
+
+ void* ret = getCurr<void*>();
+ size_ += alloc_size;
+ memset(ret, 0, alloc_size);
+ return ret;
+}
+
+void BlockOfCode::SetCodePtr(CodePtr code_ptr) {
+    // Xbyak emits code at top_ + size_, so adjusting the size moves the insertion point.
+ size_t required_size = reinterpret_cast<const u8*>(code_ptr) - getCode();
+ setSize(required_size);
+}
+
+void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) {
+ size_t current_size = getCurr<const u8*>() - reinterpret_cast<const u8*>(begin);
+ ASSERT(current_size <= size);
+ nop(size - current_size);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h
new file mode 100644
index 0000000000..3f7573af25
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h
@@ -0,0 +1,205 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+#include <xbyak/xbyak.h>
+#include <xbyak/xbyak_util.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/callback.h"
+#include "dynarmic/backend/x64/constant_pool.h"
+#include "dynarmic/backend/x64/host_feature.h"
+#include "dynarmic/backend/x64/jitstate_info.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/interface/halt_reason.h"
+#include "dynarmic/ir/cond.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using CodePtr = const void*;
+
+struct RunCodeCallbacks {
+ std::unique_ptr<Callback> LookupBlock;
+ std::unique_ptr<Callback> AddTicks;
+ std::unique_ptr<Callback> GetTicksRemaining;
+ bool enable_cycle_counting;
+};
+
+class BlockOfCode final : public Xbyak::CodeGenerator {
+public:
+ BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, std::function<void(BlockOfCode&)> rcp);
+ BlockOfCode(const BlockOfCode&) = delete;
+
+ /// Call when external emitters have finished emitting their preludes.
+ void PreludeComplete();
+
+ /// Change permissions to RW. This is required to support systems with W^X enforced.
+ void EnableWriting();
+ /// Change permissions to RX. This is required to support systems with W^X enforced.
+ void DisableWriting();
+
+ /// Clears this block of code and resets code pointer to beginning.
+ void ClearCache();
+ /// Calculates how much space is remaining to use.
+ size_t SpaceRemaining() const;
+ /// Ensure at least codesize bytes of code cache memory are committed at the current code_ptr.
+ void EnsureMemoryCommitted(size_t codesize);
+
+ /// Runs emulated code from code_ptr.
+ HaltReason RunCode(void* jit_state, CodePtr code_ptr) const;
+ /// Runs emulated code from code_ptr for a single cycle.
+ HaltReason StepCode(void* jit_state, CodePtr code_ptr) const;
+ /// Code emitter: Returns to dispatcher
+ void ReturnFromRunCode(bool mxcsr_already_exited = false);
+ /// Code emitter: Returns to dispatcher, forces return to host
+ void ForceReturnFromRunCode(bool mxcsr_already_exited = false);
+ /// Code emitter: Makes guest MXCSR the current MXCSR
+ void SwitchMxcsrOnEntry();
+ /// Code emitter: Makes saved host MXCSR the current MXCSR
+ void SwitchMxcsrOnExit();
+ /// Code emitter: Enter standard ASIMD MXCSR region
+ void EnterStandardASIMD();
+ /// Code emitter: Leave standard ASIMD MXCSR region
+ void LeaveStandardASIMD();
+    /// Code emitter: Updates cycles remaining by calling cb.AddTicks and cb.GetTicksRemaining
+ /// @note this clobbers ABI caller-save registers
+ void UpdateTicks();
+ /// Code emitter: Performs a block lookup based on current state
+ /// @note this clobbers ABI caller-save registers
+ void LookupBlock();
+
+ /// Code emitter: Load required flags for conditional cond from rax into host rflags
+ void LoadRequiredFlagsForCondFromRax(IR::Cond cond);
+
+ /// Code emitter: Calls the function
+ template<typename FunctionPointer>
+ void CallFunction(FunctionPointer fn) {
+ static_assert(std::is_pointer_v<FunctionPointer> && std::is_function_v<std::remove_pointer_t<FunctionPointer>>,
+ "Supplied type must be a pointer to a function");
+
+ const u64 address = reinterpret_cast<u64>(fn);
+ const u64 distance = address - (getCurr<u64>() + 5);
+
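+        // A rel32 call can only reach targets within +/-2 GiB of the next
+        // instruction (getCurr() + 5); the unsigned range check below detects
+        // displacements that do not fit in a signed 32-bit immediate.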
+ if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) {
+ // Far call
+ mov(rax, address);
+ call(rax);
+ } else {
+ call(fn);
+ }
+ }
+
+ /// Code emitter: Calls the lambda. Lambda must not have any captures.
+ template<typename Lambda>
+ void CallLambda(Lambda l) {
+ CallFunction(Common::FptrCast(l));
+ }
+
+ void ZeroExtendFrom(size_t bitsize, Xbyak::Reg64 reg) {
+ switch (bitsize) {
+ case 8:
+ movzx(reg.cvt32(), reg.cvt8());
+ return;
+ case 16:
+ movzx(reg.cvt32(), reg.cvt16());
+ return;
+ case 32:
+ mov(reg.cvt32(), reg.cvt32());
+ return;
+ case 64:
+ return;
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ Xbyak::Address Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0);
+
+ template<size_t esize>
+ Xbyak::Address BConst(const Xbyak::AddressFrame& frame, u64 value) {
+ return Const(frame, mcl::bit::replicate_element<u64>(esize, value),
+ mcl::bit::replicate_element<u64>(esize, value));
+ }
+
+ CodePtr GetCodeBegin() const;
+ size_t GetTotalCodeSize() const;
+
+ const void* GetReturnFromRunCodeAddress() const {
+ return return_from_run_code[0];
+ }
+
+ const void* GetForceReturnFromRunCodeAddress() const {
+ return return_from_run_code[FORCE_RETURN];
+ }
+
+ void int3() { db(0xCC); }
+
+ /// Allocate memory of `size` bytes from the same block of memory the code is in.
+ /// This is useful for objects that need to be placed close to or within code.
+ /// The lifetime of this memory is the same as the code around it.
+ void* AllocateFromCodeSpace(size_t size);
+
+ void SetCodePtr(CodePtr code_ptr);
+ void EnsurePatchLocationSize(CodePtr begin, size_t size);
+
+ // ABI registers
+#ifdef _WIN32
+ static const Xbyak::Reg64 ABI_RETURN;
+ static const Xbyak::Reg64 ABI_PARAM1;
+ static const Xbyak::Reg64 ABI_PARAM2;
+ static const Xbyak::Reg64 ABI_PARAM3;
+ static const Xbyak::Reg64 ABI_PARAM4;
+ static const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> ABI_PARAMS;
+#else
+ static const Xbyak::Reg64 ABI_RETURN;
+ static const Xbyak::Reg64 ABI_RETURN2;
+ static const Xbyak::Reg64 ABI_PARAM1;
+ static const Xbyak::Reg64 ABI_PARAM2;
+ static const Xbyak::Reg64 ABI_PARAM3;
+ static const Xbyak::Reg64 ABI_PARAM4;
+ static const Xbyak::Reg64 ABI_PARAM5;
+ static const Xbyak::Reg64 ABI_PARAM6;
+ static const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> ABI_PARAMS;
+#endif
+
+ JitStateInfo GetJitStateInfo() const { return jsi; }
+
+ bool HasHostFeature(HostFeature feature) const {
+ return (host_features & feature) == feature;
+ }
+
+private:
+ RunCodeCallbacks cb;
+ JitStateInfo jsi;
+
+ bool prelude_complete = false;
+ CodePtr code_begin = nullptr;
+
+#ifdef _WIN32
+ size_t committed_size = 0;
+#endif
+
+ ConstantPool constant_pool;
+
+ using RunCodeFuncType = HaltReason (*)(void*, CodePtr);
+ RunCodeFuncType run_code = nullptr;
+ RunCodeFuncType step_code = nullptr;
+ static constexpr size_t MXCSR_ALREADY_EXITED = 1 << 0;
+ static constexpr size_t FORCE_RETURN = 1 << 1;
+ std::array<const void*, 4> return_from_run_code;
+ void GenRunCode(std::function<void(BlockOfCode&)> rcp);
+
+ const HostFeature host_features;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp b/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp
new file mode 100644
index 0000000000..1ecd8304f1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp
@@ -0,0 +1,41 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/callback.h"
+
+#include "dynarmic/backend/x64/block_of_code.h"
+
+namespace Dynarmic::Backend::X64 {
+
+Callback::~Callback() = default;
+
+void SimpleCallback::EmitCall(BlockOfCode& code, std::function<void(RegList)> l) const {
+ l({code.ABI_PARAM1, code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4});
+ code.CallFunction(fn);
+}
+
+void SimpleCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> l) const {
+ l(code.ABI_PARAM1, {code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4});
+ code.CallFunction(fn);
+}
+
+void ArgCallback::EmitCall(BlockOfCode& code, std::function<void(RegList)> l) const {
+ l({code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4});
+ code.mov(code.ABI_PARAM1, arg);
+ code.CallFunction(fn);
+}
+
+void ArgCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> l) const {
+#if defined(WIN32) && !defined(__MINGW64__)
+ l(code.ABI_PARAM2, {code.ABI_PARAM3, code.ABI_PARAM4});
+ code.mov(code.ABI_PARAM1, arg);
+#else
+ l(code.ABI_PARAM1, {code.ABI_PARAM3, code.ABI_PARAM4});
+ code.mov(code.ABI_PARAM2, arg);
+#endif
+ code.CallFunction(fn);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/callback.h b/externals/dynarmic/src/dynarmic/backend/x64/callback.h
new file mode 100644
index 0000000000..716555daed
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/callback.h
@@ -0,0 +1,63 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic::Backend::X64 {
+
+using RegList = std::vector<Xbyak::Reg64>;
+
+class BlockOfCode;
+
+class Callback {
+public:
+ virtual ~Callback();
+
+ void EmitCall(BlockOfCode& code) const {
+ EmitCall(code, [](RegList) {});
+ }
+
+ virtual void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const = 0;
+ virtual void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const = 0;
+};
+
+class SimpleCallback final : public Callback {
+public:
+ template<typename Function>
+ SimpleCallback(Function fn)
+ : fn(reinterpret_cast<void (*)()>(fn)) {}
+
+ using Callback::EmitCall;
+
+ void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const override;
+ void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const override;
+
+private:
+ void (*fn)();
+};
+
+class ArgCallback final : public Callback {
+public:
+ template<typename Function>
+ ArgCallback(Function fn, u64 arg)
+ : fn(reinterpret_cast<void (*)()>(fn)), arg(arg) {}
+
+ using Callback::EmitCall;
+
+ void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const override;
+ void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const override;
+
+private:
+ void (*fn)();
+ u64 arg;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp
new file mode 100644
index 0000000000..ba003262b3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp
@@ -0,0 +1,38 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/constant_pool.h"
+
+#include <cstring>
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+
+namespace Dynarmic::Backend::X64 {
+
+ConstantPool::ConstantPool(BlockOfCode& code, size_t size)
+ : code(code), insertion_point(0) {
+ code.EnsureMemoryCommitted(align_size + size);
+ code.int3();
+ code.align(align_size);
+ pool = std::span<ConstantT>(
+ reinterpret_cast<ConstantT*>(code.AllocateFromCodeSpace(size)), size / align_size);
+}
+
+Xbyak::Address ConstantPool::GetConstant(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) {
+ const auto constant = ConstantT(lower, upper);
+ auto iter = constant_info.find(constant);
+ if (iter == constant_info.end()) {
+ ASSERT(insertion_point < pool.size());
+ ConstantT& target_constant = pool[insertion_point];
+ target_constant = constant;
+ iter = constant_info.insert({constant, &target_constant}).first;
+ ++insertion_point;
+ }
+ return frame[code.rip + iter->second];
+}
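+
+// Callers normally reach this through BlockOfCode::Const, e.g. (illustrative):
+//     code.movaps(xmm0, code.Const(code.xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000));
+// Requesting the same 128-bit constant again returns the address of the existing
+// pool entry instead of consuming another slot.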
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h
new file mode 100644
index 0000000000..2ab6421539
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h
@@ -0,0 +1,50 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <bit>
+#include <cstddef>
+#include <span>
+#include <utility>
+
+#include <mcl/stdint.hpp>
+#include <tsl/robin_map.h>
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic::Backend::X64 {
+
+class BlockOfCode;
+
+/// ConstantPool allocates a block of memory from BlockOfCode.
+/// It places constants into this block of memory, returning the address
+/// of the memory location where the constant is placed. If the constant
+/// already exists, its memory location is reused.
+class ConstantPool final {
+public:
+ ConstantPool(BlockOfCode& code, size_t size);
+
+ Xbyak::Address GetConstant(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0);
+
+private:
+ static constexpr size_t align_size = 16; // bytes
+
+ using ConstantT = std::pair<u64, u64>;
+ static_assert(sizeof(ConstantT) == align_size);
+
+ struct ConstantHash {
+ std::size_t operator()(const ConstantT& constant) const noexcept {
+ return constant.first ^ std::rotl<u64>(constant.second, 1);
+ }
+ };
+
+ tsl::robin_map<ConstantT, void*, ConstantHash> constant_info;
+
+ BlockOfCode& code;
+ std::span<ConstantT> pool;
+ std::size_t insertion_point;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constants.h b/externals/dynarmic/src/dynarmic/backend/x64/constants.h
new file mode 100644
index 0000000000..f817324142
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/constants.h
@@ -0,0 +1,177 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/rounding_mode.h"
+
+namespace Dynarmic::Backend::X64 {
+
+// Redefinition of _MM_CMP_* constants for use with the 'vcmp' instruction
+namespace Cmp {
+constexpr u8 Equal_OQ = 0; // Equal (Quiet, Ordered).
+constexpr u8 LessThan_OS = 1; // Less (Signaling, Ordered).
+constexpr u8 LessEqual_OS = 2; // Less/Equal (Signaling, Ordered).
+constexpr u8 Unordered_Q = 3; // Unordered (Quiet).
+constexpr u8 NotEqual_UQ = 4; // Not Equal (Quiet, Unordered).
+constexpr u8 NotLessThan_US = 5; // Not Less (Signaling, Unordered).
+constexpr u8 NotLessEqual_US = 6; // Not Less/Equal (Signaling, Unordered).
+constexpr u8 Ordered_Q = 7; // Ordered (Quiet).
+constexpr u8 Equal_UQ = 8; // Equal (Quiet, Unordered).
+constexpr u8 NotGreaterEqual_US = 9; // Not Greater/Equal (Signaling, Unordered).
+constexpr u8 NotGreaterThan_US = 10; // Not Greater (Signaling, Unordered).
+constexpr u8 False_OQ = 11; // False (Quiet, Ordered).
+constexpr u8 NotEqual_OQ = 12; // Not Equal (Quiet, Ordered).
+constexpr u8 GreaterEqual_OS = 13; // Greater/Equal (Signaling, Ordered).
+constexpr u8 GreaterThan_OS = 14; // Greater (Signaling, Ordered).
+constexpr u8 True_UQ = 15; // True (Quiet, Unordered).
+constexpr u8 Equal_OS = 16; // Equal (Signaling, Ordered).
+constexpr u8 LessThan_OQ = 17; // Less (Quiet, Ordered).
+constexpr u8 LessEqual_OQ = 18; // Less/Equal (Quiet, Ordered).
+constexpr u8 Unordered_S = 19; // Unordered (Signaling).
+constexpr u8 NotEqual_US = 20; // Not Equal (Signaling, Unordered).
+constexpr u8 NotLessThan_UQ = 21; // Not Less (Quiet, Unordered).
+constexpr u8 NotLessEqual_UQ = 22; // Not Less/Equal (Quiet, Unordered).
+constexpr u8 Ordered_S = 23; // Ordered (Signaling).
+constexpr u8 Equal_US = 24; // Equal (Signaling, Unordered).
+constexpr u8 NotGreaterEqual_UQ = 25; // Not Greater/Equal (Quiet, Unordered).
+constexpr u8 NotGreaterThan_UQ = 26; // Not Greater (Quiet, Unordered).
+constexpr u8 False_OS = 27; // False (Signaling, Ordered).
+constexpr u8 NotEqual_OS = 28; // Not Equal (Signaling, Ordered).
+constexpr u8 GreaterEqual_OQ = 29; // Greater/Equal (Quiet, Ordered).
+constexpr u8 GreaterThan_OQ = 30; // Greater (Quiet, Ordered).
+constexpr u8 True_US = 31; // True (Signaling, Unordered).
+} // namespace Cmp
+
+// Redefinition of _MM_CMPINT_* constants for use with the 'vpcmp' instruction
+namespace CmpInt {
+constexpr u8 Equal = 0x0;
+constexpr u8 LessThan = 0x1;
+constexpr u8 LessEqual = 0x2;
+constexpr u8 False = 0x3;
+constexpr u8 NotEqual = 0x4;
+constexpr u8 NotLessThan = 0x5;
+constexpr u8 GreaterEqual = 0x5;
+constexpr u8 NotLessEqual = 0x6;
+constexpr u8 GreaterThan = 0x6;
+constexpr u8 True = 0x7;
+} // namespace CmpInt
+
+// Used to generate ternary logic truth-tables for vpternlog
+// Combine these terms with bitwise operations and the resulting value is the
+// 8-bit lookup-table immediate to pass to the instruction, e.g.:
+// (Tern::a | ~Tern::b) & Tern::c
+// = 0b10100010
+// = 0xa2
+// vpternlog a, b, c, 0xa2
+//
+// ~(Tern::a ^ Tern::b) & Tern::c
+// = 0b10000010
+// = 0x82
+// vpternlog a, b, c, 0x82
+namespace Tern {
+constexpr u8 a = 0b11110000;
+constexpr u8 b = 0b11001100;
+constexpr u8 c = 0b10101010;
+} // namespace Tern
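+
+// The two worked examples above can be verified at compile time (illustrative checks):
+static_assert(((Tern::a | ~Tern::b) & Tern::c) == 0xa2);
+static_assert((~(Tern::a ^ Tern::b) & Tern::c) == 0x82);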
+
+// For use as a bitmask with vfpclass instruction
+namespace FpClass {
+constexpr u8 QNaN = 0b00000001;
+constexpr u8 ZeroPos = 0b00000010;
+constexpr u8 ZeroNeg = 0b00000100;
+constexpr u8 InfPos = 0b00001000;
+constexpr u8 InfNeg = 0b00010000;
+constexpr u8 Denormal = 0b00100000;
+constexpr u8 Negative = 0b01000000; // Negative finite value
+constexpr u8 SNaN = 0b10000000;
+} // namespace FpClass
+
+// Opcodes for use with vfixupimm
+enum class FpFixup : u8 {
+ Dest = 0b0000, // Preserve destination
+ Norm_Src = 0b0001, // Source operand (Denormal as positive-zero)
+ QNaN_Src = 0b0010, // QNaN with sign of source (Denormal as positive-zero)
+ IndefNaN = 0b0011, // Indefinite QNaN (Negative QNaN with no payload on x86)
+ NegInf = 0b0100, // -Infinity
+ PosInf = 0b0101, // +Infinity
+ Inf_Src = 0b0110, // Infinity with sign of source (Denormal as positive-zero)
+ NegZero = 0b0111, // -0.0
+ PosZero = 0b1000, // +0.0
+ NegOne = 0b1001, // -1.0
+ PosOne = 0b1010, // +1.0
+ Half = 0b1011, // 0.5
+ Ninety = 0b1100, // 90.0
+ HalfPi = 0b1101, // PI/2
+ PosMax = 0b1110, // +{FLT_MAX,DBL_MAX}
+ NegMax = 0b1111, // -{FLT_MAX,DBL_MAX}
+};
+
+// Generates 32-bit LUT for vfixupimm instruction
+constexpr u32 FixupLUT(FpFixup src_qnan = FpFixup::Dest,
+ FpFixup src_snan = FpFixup::Dest,
+ FpFixup src_zero = FpFixup::Dest,
+ FpFixup src_posone = FpFixup::Dest,
+ FpFixup src_neginf = FpFixup::Dest,
+ FpFixup src_posinf = FpFixup::Dest,
+ FpFixup src_neg = FpFixup::Dest,
+ FpFixup src_pos = FpFixup::Dest) {
+ u32 fixup_lut = 0;
+ fixup_lut = mcl::bit::set_bits<0, 3, u32>(fixup_lut, static_cast<u32>(src_qnan));
+ fixup_lut = mcl::bit::set_bits<4, 7, u32>(fixup_lut, static_cast<u32>(src_snan));
+ fixup_lut = mcl::bit::set_bits<8, 11, u32>(fixup_lut, static_cast<u32>(src_zero));
+ fixup_lut = mcl::bit::set_bits<12, 15, u32>(fixup_lut, static_cast<u32>(src_posone));
+ fixup_lut = mcl::bit::set_bits<16, 19, u32>(fixup_lut, static_cast<u32>(src_neginf));
+ fixup_lut = mcl::bit::set_bits<20, 23, u32>(fixup_lut, static_cast<u32>(src_posinf));
+ fixup_lut = mcl::bit::set_bits<24, 27, u32>(fixup_lut, static_cast<u32>(src_neg));
+ fixup_lut = mcl::bit::set_bits<28, 31, u32>(fixup_lut, static_cast<u32>(src_pos));
+ return fixup_lut;
+}
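+
+// For instance (illustrative), a table that replaces any NaN source with the x86
+// "indefinite" QNaN and leaves every other class untouched is:
+static_assert(FixupLUT(FpFixup::IndefNaN, FpFixup::IndefNaN) == 0x00000033);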
+
+// Opcodes for use with vrange* instructions
+enum class FpRangeSelect : u8 {
+ Min = 0b00,
+ Max = 0b01,
+ AbsMin = 0b10, // Smaller absolute value
+ AbsMax = 0b11, // Larger absolute value
+};
+
+enum class FpRangeSign : u8 {
+ A = 0b00, // Copy sign of operand A
+ Preserve = 0b01, // Leave sign as is
+ Positive = 0b10, // Set Positive
+ Negative = 0b11, // Set Negative
+};
+
+// Generates 8-bit immediate LUT for vrange instruction
+constexpr u8 FpRangeLUT(FpRangeSelect range_select, FpRangeSign range_sign) {
+ u8 range_lut = 0;
+ range_lut = mcl::bit::set_bits<0, 1, u8>(range_lut, static_cast<u8>(range_select));
+ range_lut = mcl::bit::set_bits<2, 3, u8>(range_lut, static_cast<u8>(range_sign));
+ return range_lut;
+}
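+
+// e.g. (illustrative) selecting the larger absolute value and leaving its sign as is:
+static_assert(FpRangeLUT(FpRangeSelect::AbsMax, FpRangeSign::Preserve) == 0b0111);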
+
+constexpr std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) {
+ switch (rounding_mode) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ return 0b00;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ return 0b10;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ return 0b01;
+ case FP::RoundingMode::TowardsZero:
+ return 0b11;
+ default:
+ return std::nullopt;
+ }
+}
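+
+// Note that the returned immediate uses the x86 encoding (00 = nearest, 01 = towards
+// -infinity, 10 = towards +infinity, 11 = truncate), which is why TowardsPlusInfinity
+// and TowardsMinusInfinity map to 0b10 and 0b01 respectively, the reverse of FPCR.RMode.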
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h b/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h
new file mode 100644
index 0000000000..778536a3bb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h
@@ -0,0 +1,81 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstring>
+#include <utility>
+
+#include <mcl/bit_cast.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/function_info.hpp>
+
+#include "dynarmic/backend/x64/callback.h"
+
+namespace Dynarmic {
+namespace Backend::X64 {
+
+namespace impl {
+
+template<typename FunctionType, FunctionType mfp>
+struct ThunkBuilder;
+
+template<typename C, typename R, typename... Args, R (C::*mfp)(Args...)>
+struct ThunkBuilder<R (C::*)(Args...), mfp> {
+ static R Thunk(C* this_, Args... args) {
+ return (this_->*mfp)(std::forward<Args>(args)...);
+ }
+};
+
+} // namespace impl
+
+template<auto mfp>
+ArgCallback DevirtualizeGeneric(mcl::class_type<decltype(mfp)>* this_) {
+ return ArgCallback{&impl::ThunkBuilder<decltype(mfp), mfp>::Thunk, reinterpret_cast<u64>(this_)};
+}
+
+template<auto mfp>
+ArgCallback DevirtualizeWindows(mcl::class_type<decltype(mfp)>* this_) {
+ static_assert(sizeof(mfp) == 8);
+ return ArgCallback{mcl::bit_cast<u64>(mfp), reinterpret_cast<u64>(this_)};
+}
+
+template<auto mfp>
+ArgCallback DevirtualizeItanium(mcl::class_type<decltype(mfp)>* this_) {
+ struct MemberFunctionPointer {
+ /// For a non-virtual function, this is a simple function pointer.
+ /// For a virtual function, it is (1 + virtual table offset in bytes).
+ u64 ptr;
+ /// The required adjustment to `this`, prior to the call.
+ u64 adj;
+ } mfp_struct = mcl::bit_cast<MemberFunctionPointer>(mfp);
+
+ static_assert(sizeof(MemberFunctionPointer) == 16);
+ static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp));
+
+ u64 fn_ptr = mfp_struct.ptr;
+ u64 this_ptr = reinterpret_cast<u64>(this_) + mfp_struct.adj;
+ if (mfp_struct.ptr & 1) {
+ u64 vtable = mcl::bit_cast_pointee<u64>(this_ptr);
+ fn_ptr = mcl::bit_cast_pointee<u64>(vtable + fn_ptr - 1);
+ }
+ return ArgCallback{fn_ptr, this_ptr};
+}
+
+template<auto mfp>
+ArgCallback Devirtualize(mcl::class_type<decltype(mfp)>* this_) {
+#if defined(__APPLE__) || defined(linux) || defined(__linux) || defined(__linux__)
+ return DevirtualizeItanium<mfp>(this_);
+#elif defined(__MINGW64__)
+ return DevirtualizeItanium<mfp>(this_);
+#elif defined(_WIN32)
+ return DevirtualizeWindows<mfp>(this_);
+#else
+ return DevirtualizeGeneric<mfp>(this_);
+#endif
+}
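+
+// Typical use (hypothetical callback interface): given
+//     struct Callbacks { virtual u64 GetTicksRemaining() = 0; };
+// a backend can resolve the virtual call once at JIT time with
+//     Devirtualize<&Callbacks::GetTicksRemaining>(cb).EmitCall(code);
+// so the generated code performs a direct call rather than a vtable dispatch.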
+
+} // namespace Backend::X64
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
new file mode 100644
index 0000000000..902ddef9a7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
@@ -0,0 +1,415 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/emit_x64.h"
+
+#include <iterator>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/scope_exit.hpp>
+#include <mcl/stdint.hpp>
+#include <tsl/robin_set.h>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/nzcv_util.h"
+#include "dynarmic/backend/x64/perf_map.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+#include "dynarmic/backend/x64/verbose_debugging_output.h"
+#include "dynarmic/common/variant_util.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary.
+// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart.
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
+ : reg_alloc(reg_alloc), block(block) {}
+
+EmitContext::~EmitContext() = default;
+
+void EmitContext::EraseInstruction(IR::Inst* inst) {
+ block.Instructions().erase(inst);
+ inst->ClearArgs();
+}
+
+EmitX64::EmitX64(BlockOfCode& code)
+ : code(code) {
+ exception_handler.Register(code);
+}
+
+EmitX64::~EmitX64() = default;
+
+std::optional<EmitX64::BlockDescriptor> EmitX64::GetBasicBlock(IR::LocationDescriptor descriptor) const {
+ const auto iter = block_descriptors.find(descriptor);
+ if (iter == block_descriptors.end()) {
+ return std::nullopt;
+ }
+ return iter->second;
+}
+
+void EmitX64::EmitVoid(EmitContext&, IR::Inst*) {
+}
+
+void EmitX64::EmitIdentity(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (!args[0].IsImmediate()) {
+ ctx.reg_alloc.DefineValue(inst, args[0]);
+ }
+}
+
+void EmitX64::EmitBreakpoint(EmitContext&, IR::Inst*) {
+ code.int3();
+}
+
+void EmitX64::EmitCallHostFunction(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(nullptr, args[1], args[2], args[3]);
+ code.mov(rax, args[0].GetImmediateU64());
+ code.call(rax);
+}
+
+void EmitX64::PushRSBHelper(Xbyak::Reg64 loc_desc_reg, Xbyak::Reg64 index_reg, IR::LocationDescriptor target) {
+ using namespace Xbyak::util;
+
+ const auto iter = block_descriptors.find(target);
+ CodePtr target_code_ptr = iter != block_descriptors.end()
+ ? iter->second.entrypoint
+ : code.GetReturnFromRunCodeAddress();
+
+ code.mov(index_reg.cvt32(), dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr]);
+
+ code.mov(loc_desc_reg, target.Value());
+
+ patch_information[target].mov_rcx.push_back(code.getCurr());
+ EmitPatchMovRcx(target_code_ptr);
+
+ code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_location_descriptors], loc_desc_reg);
+ code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_codeptrs], rcx);
+
+ code.add(index_reg.cvt32(), 1);
+ code.and_(index_reg.cvt32(), u32(code.GetJitStateInfo().rsb_ptr_mask));
+ code.mov(dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr], index_reg.cvt32());
+}
+
+void EmitX64::EmitVerboseDebuggingOutput(RegAlloc& reg_alloc) {
+ code.sub(rsp, sizeof(RegisterData));
+ code.stmxcsr(dword[rsp + offsetof(RegisterData, mxcsr)]);
+ for (int i = 0; i < 16; i++) {
+ if (rsp.getIdx() == i) {
+ continue;
+ }
+ code.mov(qword[rsp + offsetof(RegisterData, gprs) + sizeof(u64) * i], Xbyak::Reg64{i});
+ }
+ for (int i = 0; i < 16; i++) {
+ code.movaps(xword[rsp + offsetof(RegisterData, xmms) + 2 * sizeof(u64) * i], Xbyak::Xmm{i});
+ }
+ code.lea(rax, ptr[rsp + sizeof(RegisterData) + offsetof(StackLayout, spill)]);
+ code.mov(xword[rsp + offsetof(RegisterData, spill)], rax);
+
+ reg_alloc.EmitVerboseDebuggingOutput();
+
+ for (int i = 0; i < 16; i++) {
+ if (rsp.getIdx() == i) {
+ continue;
+ }
+ code.mov(Xbyak::Reg64{i}, qword[rsp + offsetof(RegisterData, gprs) + sizeof(u64) * i]);
+ }
+ for (int i = 0; i < 16; i++) {
+ code.movaps(Xbyak::Xmm{i}, xword[rsp + offsetof(RegisterData, xmms) + 2 * sizeof(u64) * i]);
+ }
+ code.ldmxcsr(dword[rsp + offsetof(RegisterData, mxcsr)]);
+ code.add(rsp, sizeof(RegisterData));
+}
+
+void EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[0].IsImmediate());
+ const u64 unique_hash_of_target = args[0].GetImmediateU64();
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+ const Xbyak::Reg64 loc_desc_reg = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 index_reg = ctx.reg_alloc.ScratchGpr();
+
+ PushRSBHelper(loc_desc_reg, index_reg, IR::LocationDescriptor{unique_hash_of_target});
+}
+
+void EmitX64::EmitGetCarryFromOp(EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+}
+
+void EmitX64::EmitGetOverflowFromOp(EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+}
+
+void EmitX64::EmitGetGEFromOp(EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+}
+
+void EmitX64::EmitGetUpperFromOp(EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+}
+
+void EmitX64::EmitGetLowerFromOp(EmitContext& ctx, IR::Inst* inst) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+}
+
+void EmitX64::EmitGetNZFromOp(EmitContext& ctx, IR::Inst* inst) {
+ if (ctx.reg_alloc.IsValueLive(inst)) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const int bitsize = [&] {
+ switch (args[0].GetType()) {
+ case IR::Type::U8:
+ return 8;
+ case IR::Type::U16:
+ return 16;
+ case IR::Type::U32:
+ return 32;
+ case IR::Type::U64:
+ return 64;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ const Xbyak::Reg64 nz = ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
+ code.test(value, value);
+ code.lahf();
+ code.movzx(eax, ah);
+ ctx.reg_alloc.DefineValue(inst, nz);
+}
+
+void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) {
+ if (ctx.reg_alloc.IsValueLive(inst)) {
+ ctx.reg_alloc.RegisterPseudoOperation(inst);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const int bitsize = [&] {
+ switch (args[0].GetType()) {
+ case IR::Type::U8:
+ return 8;
+ case IR::Type::U16:
+ return 16;
+ case IR::Type::U32:
+ return 32;
+ case IR::Type::U64:
+ return 64;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
+ code.test(value, value);
+ code.lahf();
+ code.mov(al, 0);
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+}
+
+void EmitX64::EmitGetCFlagFromNZCV(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsImmediate()) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const u32 value = (args[0].GetImmediateU32() >> 8) & 1;
+ code.mov(result, value);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ code.shr(result, 8);
+ code.and_(result, 1);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsImmediate()) {
+ const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr().cvt32();
+ u32 value = 0;
+ value |= mcl::bit::get_bit<31>(args[0].GetImmediateU32()) ? (1 << 15) : 0;
+ value |= mcl::bit::get_bit<30>(args[0].GetImmediateU32()) ? (1 << 14) : 0;
+ value |= mcl::bit::get_bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0;
+ value |= mcl::bit::get_bit<28>(args[0].GetImmediateU32()) ? (1 << 0) : 0;
+ code.mov(nzcv, value);
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+ } else if (code.HasHostFeature(HostFeature::FastBMI2)) {
+ const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
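+        // PDEP scatters the four NZCV bits (bits 3:0 after the shift) into the flag bit positions selected by NZCV::x64_mask.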
+ code.shr(nzcv, 28);
+ code.mov(tmp, NZCV::x64_mask);
+ code.pdep(nzcv, nzcv, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+ } else {
+ const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
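+        // Without BMI2, multiplying by NZCV::to_x64_multiplier places a copy of the NZCV nibble at each required offset; the AND with NZCV::x64_mask then keeps one flag bit from each copy.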
+ code.shr(nzcv, 28);
+ code.imul(nzcv, nzcv, NZCV::to_x64_multiplier);
+ code.and_(nzcv, NZCV::x64_mask);
+
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+ }
+}
+
+void EmitX64::EmitAddCycles(size_t cycles) {
+ ASSERT(cycles < std::numeric_limits<s32>::max());
+ code.sub(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], static_cast<u32>(cycles));
+}
+
+Xbyak::Label EmitX64::EmitCond(IR::Cond cond) {
+ Xbyak::Label pass;
+
+ code.mov(eax, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
+
+ code.LoadRequiredFlagsForCondFromRax(cond);
+
+ switch (cond) {
+ case IR::Cond::EQ:
+ code.jz(pass);
+ break;
+ case IR::Cond::NE:
+ code.jnz(pass);
+ break;
+ case IR::Cond::CS:
+ code.jc(pass);
+ break;
+ case IR::Cond::CC:
+ code.jnc(pass);
+ break;
+ case IR::Cond::MI:
+ code.js(pass);
+ break;
+ case IR::Cond::PL:
+ code.jns(pass);
+ break;
+ case IR::Cond::VS:
+ code.jo(pass);
+ break;
+ case IR::Cond::VC:
+ code.jno(pass);
+ break;
+ case IR::Cond::HI:
+ code.ja(pass);
+ break;
+ case IR::Cond::LS:
+ code.jna(pass);
+ break;
+ case IR::Cond::GE:
+ code.jge(pass);
+ break;
+ case IR::Cond::LT:
+ code.jl(pass);
+ break;
+ case IR::Cond::GT:
+ code.jg(pass);
+ break;
+ case IR::Cond::LE:
+ code.jle(pass);
+ break;
+ default:
+ ASSERT_MSG(false, "Unknown cond {}", static_cast<size_t>(cond));
+ break;
+ }
+
+ return pass;
+}
+
+EmitX64::BlockDescriptor EmitX64::RegisterBlock(const IR::LocationDescriptor& descriptor, CodePtr entrypoint, size_t size) {
+ PerfMapRegister(entrypoint, code.getCurr(), LocationDescriptorToFriendlyName(descriptor));
+ Patch(descriptor, entrypoint);
+
+ BlockDescriptor block_desc{entrypoint, size};
+ block_descriptors.insert({IR::LocationDescriptor{descriptor.Value()}, block_desc});
+ return block_desc;
+}
+
+void EmitX64::EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) {
+ Common::VisitVariant<void>(terminal, [this, initial_location, is_single_step](auto x) {
+ using T = std::decay_t<decltype(x)>;
+ if constexpr (!std::is_same_v<T, IR::Term::Invalid>) {
+ this->EmitTerminalImpl(x, initial_location, is_single_step);
+ } else {
+ ASSERT_MSG(false, "Invalid terminal");
+ }
+ });
+}
+
+void EmitX64::Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) {
+ const CodePtr save_code_ptr = code.getCurr();
+ const PatchInformation& patch_info = patch_information[target_desc];
+
+ for (CodePtr location : patch_info.jg) {
+ code.SetCodePtr(location);
+ EmitPatchJg(target_desc, target_code_ptr);
+ }
+
+ for (CodePtr location : patch_info.jz) {
+ code.SetCodePtr(location);
+ EmitPatchJz(target_desc, target_code_ptr);
+ }
+
+ for (CodePtr location : patch_info.jmp) {
+ code.SetCodePtr(location);
+ EmitPatchJmp(target_desc, target_code_ptr);
+ }
+
+ for (CodePtr location : patch_info.mov_rcx) {
+ code.SetCodePtr(location);
+ EmitPatchMovRcx(target_code_ptr);
+ }
+
+ code.SetCodePtr(save_code_ptr);
+}
+
+void EmitX64::Unpatch(const IR::LocationDescriptor& target_desc) {
+ if (patch_information.count(target_desc)) {
+ Patch(target_desc, nullptr);
+ }
+}
+
+void EmitX64::ClearCache() {
+ block_descriptors.clear();
+ patch_information.clear();
+
+ PerfMapClear();
+}
+
+void EmitX64::InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& locations) {
+ code.EnableWriting();
+ SCOPE_EXIT {
+ code.DisableWriting();
+ };
+
+ for (const auto& descriptor : locations) {
+ const auto it = block_descriptors.find(descriptor);
+ if (it == block_descriptors.end()) {
+ continue;
+ }
+
+ Unpatch(descriptor);
+
+ block_descriptors.erase(it);
+ }
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h
new file mode 100644
index 0000000000..5a46f04ee6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h
@@ -0,0 +1,145 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <mcl/bitsizeof.hpp>
+#include <tsl/robin_map.h>
+#include <tsl/robin_set.h>
+#include <xbyak/xbyak.h>
+#include <xbyak/xbyak_util.h>
+
+#include "dynarmic/backend/exception_handler.h"
+#include "dynarmic/backend/x64/reg_alloc.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/ir/location_descriptor.h"
+#include "dynarmic/ir/terminal.h"
+
+namespace Dynarmic::IR {
+class Block;
+class Inst;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic {
+enum class OptimizationFlag : u32;
+} // namespace Dynarmic
+
+namespace Dynarmic::Backend::X64 {
+
+class BlockOfCode;
+
+using A64FullVectorWidth = std::integral_constant<size_t, 128>;
+
+// Array alias that always sizes itself according to the given type T
+// relative to the size of a vector register. e.g. T = u32 would result
+// in a std::array<u32, 4>.
+template<typename T>
+using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>;
+
+template<typename T>
+using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T> / 2>;
+
+struct EmitContext {
+ EmitContext(RegAlloc& reg_alloc, IR::Block& block);
+ virtual ~EmitContext();
+
+ void EraseInstruction(IR::Inst* inst);
+
+ virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
+
+ virtual bool HasOptimization(OptimizationFlag flag) const = 0;
+
+ RegAlloc& reg_alloc;
+ IR::Block& block;
+
+ std::vector<std::function<void()>> deferred_emits;
+};
+
+using SharedLabel = std::shared_ptr<Xbyak::Label>;
+
+inline SharedLabel GenSharedLabel() {
+ return std::make_shared<Xbyak::Label>();
+}
+
+class EmitX64 {
+public:
+ struct BlockDescriptor {
+ CodePtr entrypoint; // Entrypoint of emitted code
+ size_t size; // Length in bytes of emitted code
+ };
+
+ explicit EmitX64(BlockOfCode& code);
+ virtual ~EmitX64();
+
+ /// Looks up an emitted host block in the cache.
+ std::optional<BlockDescriptor> GetBasicBlock(IR::LocationDescriptor descriptor) const;
+
+ /// Empties the entire cache.
+ virtual void ClearCache();
+
+ /// Invalidates a selection of basic blocks.
+ void InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& locations);
+
+protected:
+ // Microinstruction emitters
+#define OPCODE(name, type, ...) void Emit##name(EmitContext& ctx, IR::Inst* inst);
+#define A32OPC(...)
+#define A64OPC(...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+
+ // Helpers
+ virtual std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const = 0;
+ void EmitAddCycles(size_t cycles);
+ Xbyak::Label EmitCond(IR::Cond cond);
+ BlockDescriptor RegisterBlock(const IR::LocationDescriptor& location_descriptor, CodePtr entrypoint, size_t size);
+ void PushRSBHelper(Xbyak::Reg64 loc_desc_reg, Xbyak::Reg64 index_reg, IR::LocationDescriptor target);
+
+ void EmitVerboseDebuggingOutput(RegAlloc& reg_alloc);
+
+ // Terminal instruction emitters
+ void EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step);
+ virtual void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+ virtual void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0;
+
+ // Patching
+ struct PatchInformation {
+ std::vector<CodePtr> jg;
+ std::vector<CodePtr> jz;
+ std::vector<CodePtr> jmp;
+ std::vector<CodePtr> mov_rcx;
+ };
+ void Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr);
+ virtual void Unpatch(const IR::LocationDescriptor& target_desc);
+ virtual void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0;
+ virtual void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0;
+ virtual void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0;
+ virtual void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) = 0;
+
+ // State
+ BlockOfCode& code;
+ ExceptionHandler exception_handler;
+ tsl::robin_map<IR::LocationDescriptor, BlockDescriptor> block_descriptors;
+ tsl::robin_map<IR::LocationDescriptor, PatchInformation> patch_information;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp
new file mode 100644
index 0000000000..9430f0726b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp
@@ -0,0 +1,109 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/crypto/aes.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+namespace AES = Common::Crypto::AES;
+
+using AESFn = void(AES::State&, const AES::State&);
+
+static void EmitAESFunction(RegAlloc::ArgumentInfo args, EmitContext& ctx, BlockOfCode& code, IR::Inst* inst, AESFn fn) {
+ constexpr u32 stack_space = static_cast<u32>(sizeof(AES::State)) * 2;
+ const Xbyak::Xmm input = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+
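+    // The scratch area holds two AES::State values: fn writes its result into the first and reads its input from the second, which is filled from the XMM register.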
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + sizeof(AES::State)]);
+ code.movaps(xword[code.ABI_PARAM2], input);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAESDecryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AES)) {
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(zero, zero);
+ code.aesdeclast(data, zero);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitAESFunction(args, ctx, code, inst, AES::DecryptSingleRound);
+}
+
+void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AES)) {
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(zero, zero);
+ code.aesenclast(data, zero);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitAESFunction(args, ctx, code, inst, AES::EncryptSingleRound);
+}
+
+void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AES)) {
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.aesimc(data, data);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitAESFunction(args, ctx, code, inst, AES::InverseMixColumns);
+}
+
+void EmitX64::EmitAESMixColumns(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AES)) {
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(zero, zero);
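+        // AESDECLAST with a zero key applies InvShiftRows and InvSubBytes; AESENC then applies ShiftRows, SubBytes and MixColumns.
+        // The ShiftRows/SubBytes pairs cancel with their inverses, leaving MixColumns alone.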
+ code.aesdeclast(data, zero);
+ code.aesenc(data, zero);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitAESFunction(args, ctx, code, inst, AES::MixColumns);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp
new file mode 100644
index 0000000000..842a8612ee
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp
@@ -0,0 +1,154 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+#include <climits>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/crypto/crc32.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+namespace CRC32 = Common::Crypto::CRC32;
+
+static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size);
+
+ if (data_size != 64) {
+ code.crc32(crc, value);
+ } else {
+ code.crc32(crc.cvt64(), value);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, crc);
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
+ code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
+ code.CallFunction(&CRC32::ComputeCRC32Castagnoli);
+}
+
+static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size < 32) {
+ const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]);
+ const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
+
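+        // PCLMULQDQ-based CRC-32: the constants pack the bit-reflected ISO polynomial (0x1DB710641) together with its folding/Barrett-reduction factors.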
+ code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
+
+ code.movzx(value.cvt32(), value.changeBit(data_size));
+ code.xor_(value.cvt32(), crc);
+ code.movd(xmm_tmp, value.cvt32());
+ code.pslldq(xmm_tmp, (64 - data_size) / 8);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpclmulqdq(xmm_value, xmm_tmp, xmm_const, 0x00);
+ code.pclmulqdq(xmm_value, xmm_const, 0x10);
+ code.pxor(xmm_value, xmm_tmp);
+ } else {
+ code.movdqa(xmm_value, xmm_tmp);
+ code.pclmulqdq(xmm_value, xmm_const, 0x00);
+ code.pclmulqdq(xmm_value, xmm_const, 0x10);
+ code.pxor(xmm_value, xmm_tmp);
+ }
+
+ code.pextrd(crc, xmm_value, 2);
+
+ ctx.reg_alloc.DefineValue(inst, crc);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
+ const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
+
+ code.xor_(crc, value);
+ code.shl(crc.cvt64(), 32);
+ code.movq(xmm_value, crc.cvt64());
+
+ code.pclmulqdq(xmm_value, xmm_const, 0x00);
+ code.pclmulqdq(xmm_value, xmm_const, 0x10);
+
+ code.pextrd(crc, xmm_value, 2);
+
+ ctx.reg_alloc.DefineValue(inst, crc);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
+ const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
+ const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641));
+
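+        // A 32-bit MOV zero-extends into the full 64-bit register, clearing the upper half of the CRC before the XOR.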
+ code.mov(crc, crc);
+ code.xor_(crc.cvt64(), value);
+ code.movq(xmm_value, crc.cvt64());
+
+ code.pclmulqdq(xmm_value, xmm_const, 0x00);
+ code.pclmulqdq(xmm_value, xmm_const, 0x10);
+
+ code.pextrd(crc, xmm_value, 2);
+
+ ctx.reg_alloc.DefineValue(inst, crc);
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
+ code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
+ code.CallFunction(&CRC32::ComputeCRC32ISO);
+}
+
+void EmitX64::EmitCRC32Castagnoli8(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32Castagnoli(code, ctx, inst, 8);
+}
+
+void EmitX64::EmitCRC32Castagnoli16(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32Castagnoli(code, ctx, inst, 16);
+}
+
+void EmitX64::EmitCRC32Castagnoli32(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32Castagnoli(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitCRC32Castagnoli64(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32Castagnoli(code, ctx, inst, 64);
+}
+
+void EmitX64::EmitCRC32ISO8(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32ISO(code, ctx, inst, 8);
+}
+
+void EmitX64::EmitCRC32ISO16(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32ISO(code, ctx, inst, 16);
+}
+
+void EmitX64::EmitCRC32ISO32(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32ISO(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitCRC32ISO64(EmitContext& ctx, IR::Inst* inst) {
+ EmitCRC32ISO(code, ctx, inst, 64);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp
new file mode 100644
index 0000000000..98197c2db3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp
@@ -0,0 +1,1701 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstddef>
+#include <type_traits>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void EmitX64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 lo = ctx.reg_alloc.UseScratchGpr(args[0]);
+ const Xbyak::Reg64 hi = ctx.reg_alloc.UseScratchGpr(args[1]);
+
+ code.shl(hi, 32);
+ code.mov(lo.cvt32(), lo.cvt32()); // Zero extend to 64-bits
+ code.or_(lo, hi);
+
+ ctx.reg_alloc.DefineValue(inst, lo);
+}
+
+void EmitX64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 lo = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(result, lo);
+ code.pinsrq(result, hi, 1);
+ } else {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.movq(result, lo);
+ code.movq(tmp, hi);
+ code.punpcklqdq(result, tmp);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitLeastSignificantWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ // TODO: DefineValue directly on Argument
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
+ code.mov(result.cvt32(), source.cvt32());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitMostSignificantWord(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.shr(result, 32);
+
+ if (carry_inst) {
+ const Xbyak::Reg64 carry = ctx.reg_alloc.ScratchGpr();
+ code.setc(carry.cvt8());
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitLeastSignificantHalf(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ // TODO: DefineValue directly on Argument
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
+ code.movzx(result.cvt32(), source.cvt16());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitLeastSignificantByte(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ // TODO: DefineValue directly on Argument
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
+ code.movzx(result.cvt32(), source.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitMostSignificantBit(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ // TODO: Flag optimization
+ code.shr(result, 31);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitIsZero32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ // TODO: Flag optimization
+ code.test(result, result);
+ code.sete(result.cvt8());
+ code.movzx(result, result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitIsZero64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ // TODO: Flag optimization
+ code.test(result, result);
+ code.sete(result.cvt8());
+ code.movzx(result, result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitTestBit(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ ASSERT(args[1].IsImmediate());
+ // TODO: Flag optimization
+ code.bt(result, args[1].GetImmediateU8());
+ code.setc(result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr(HostLoc::RAX).cvt32();
+ const Xbyak::Reg then_ = ctx.reg_alloc.UseGpr(args[1]).changeBit(bitsize);
+ const Xbyak::Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]).changeBit(bitsize);
+
+ code.mov(nzcv, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]);
+
+ code.LoadRequiredFlagsForCondFromRax(args[0].GetImmediateCond());
+
+ switch (args[0].GetImmediateCond()) {
+ case IR::Cond::EQ:
+ code.cmovz(else_, then_);
+ break;
+ case IR::Cond::NE:
+ code.cmovnz(else_, then_);
+ break;
+ case IR::Cond::CS:
+ code.cmovc(else_, then_);
+ break;
+ case IR::Cond::CC:
+ code.cmovnc(else_, then_);
+ break;
+ case IR::Cond::MI:
+ code.cmovs(else_, then_);
+ break;
+ case IR::Cond::PL:
+ code.cmovns(else_, then_);
+ break;
+ case IR::Cond::VS:
+ code.cmovo(else_, then_);
+ break;
+ case IR::Cond::VC:
+ code.cmovno(else_, then_);
+ break;
+ case IR::Cond::HI:
+ code.cmova(else_, then_);
+ break;
+ case IR::Cond::LS:
+ code.cmovna(else_, then_);
+ break;
+ case IR::Cond::GE:
+ code.cmovge(else_, then_);
+ break;
+ case IR::Cond::LT:
+ code.cmovl(else_, then_);
+ break;
+ case IR::Cond::GT:
+ code.cmovg(else_, then_);
+ break;
+ case IR::Cond::LE:
+ code.cmovle(else_, then_);
+ break;
+ case IR::Cond::AL:
+ case IR::Cond::NV:
+ code.mov(else_, then_);
+ break;
+ default:
+ ASSERT_MSG(false, "Invalid cond {}", static_cast<size_t>(args[0].GetImmediateCond()));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, else_);
+}
+
+void EmitX64::EmitConditionalSelect32(EmitContext& ctx, IR::Inst* inst) {
+ EmitConditionalSelect(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitConditionalSelect64(EmitContext& ctx, IR::Inst* inst) {
+ EmitConditionalSelect(code, ctx, inst, 64);
+}
+
+void EmitX64::EmitConditionalSelectNZCV(EmitContext& ctx, IR::Inst* inst) {
+ EmitConditionalSelect(code, ctx, inst, 32);
+}
+
+static void EmitExtractRegister(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit_size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bit_size);
+ const Xbyak::Reg operand = ctx.reg_alloc.UseGpr(args[1]).changeBit(bit_size);
+ const u8 lsb = args[2].GetImmediateU8();
+
+ code.shrd(result, operand, lsb);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitExtractRegister32(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) {
+ EmitExtractRegister(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitExtractRegister64(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) {
+ EmitExtractRegister(code, ctx, inst, 64);
+}
+
+static void EmitReplicateBit(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit_size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const u8 bit = args[1].GetImmediateU8();
+
+ if (bit == bit_size - 1) {
+ const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bit_size);
+
+ code.sar(result, bit_size - 1);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bit_size);
+ const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bit_size);
+
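+    // BT copies the selected bit into CF; SBB of a register with itself then materialises 0 or all-ones from that carry.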
+ code.xor_(result, result);
+ code.bt(value, bit);
+ code.sbb(result, result);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitReplicateBit32(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) {
+ EmitReplicateBit(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitReplicateBit64(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) {
+ EmitReplicateBit(code, ctx, inst, 64);
+}
+
+void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift <= 31) {
+ code.shl(result, shift);
+ } else {
+ code.xor_(result, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.shlx(result, operand, shift);
+ code.xor_(zero, zero);
+ code.cmp(shift.cvt8(), 32);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // The 32-bit x64 SHL instruction masks the shift count by 0x1F before performing the shift.
+            // ARM differs in behaviour: it does not mask the count, so shifts above 31 result in zeros.
+
+ code.shl(result, code.cl);
+ code.xor_(zero, zero);
+ code.cmp(code.cl, 32);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ } else {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32();
+
+ if (shift == 0) {
+ // There is nothing more to do.
+ } else if (shift < 32) {
+ code.bt(carry.cvt32(), 0);
+ code.shl(result, shift);
+ code.setc(carry.cvt8());
+ } else if (shift > 32) {
+ code.xor_(result, result);
+ code.xor_(carry, carry);
+ } else {
+ code.mov(carry, result);
+ code.xor_(result, result);
+ code.and_(carry, 1);
+ }
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32();
+
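+            // Work in the upper half of a 64-bit register: the bit shifted out of bit 63 is the ARM carry-out,
+            // a zero count leaves CF (the BT-loaded carry-in) untouched, and counts above 63 are clamped so that
+            // SHL's mod-64 masking of the count cannot interfere.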
+ code.mov(tmp, 63);
+ code.cmp(code.cl, 63);
+ code.cmova(code.ecx, tmp);
+ code.shl(result.cvt64(), 32);
+ code.bt(carry.cvt32(), 0);
+ code.shl(result.cvt64(), code.cl);
+ code.setc(carry.cvt8());
+ code.shr(result.cvt64(), 32);
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ }
+}
+
+void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift < 64) {
+ code.shl(result, shift);
+ } else {
+ code.xor_(result.cvt32(), result.cvt32());
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
+ const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+
+ code.shlx(result, operand, shift);
+ code.xor_(zero.cvt32(), zero.cvt32());
+ code.cmp(shift.cvt8(), 64);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+
+        // The 64-bit x64 SHL instruction masks the shift count by 0x3F before performing the shift.
+        // ARM differs in behaviour: it does not mask the count, so shifts above 63 result in zeros.
+
+ code.shl(result, code.cl);
+ code.xor_(zero.cvt32(), zero.cvt32());
+ code.cmp(code.cl, 64);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift <= 31) {
+ code.shr(result, shift);
+ } else {
+ code.xor_(result, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.shrx(result, operand, shift);
+ code.xor_(zero, zero);
+ code.cmp(shift.cvt8(), 32);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // The 32-bit x64 SHR instruction masks the shift count by 0x1F before performing the shift.
+            // ARM differs in behaviour: it does not mask the count, so shifts above 31 result in zeros.
+
+ code.shr(result, code.cl);
+ code.xor_(zero, zero);
+ code.cmp(code.cl, 32);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ } else {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32();
+
+ if (shift == 0) {
+ // There is nothing more to do.
+ } else if (shift < 32) {
+ code.shr(result, shift);
+ code.setc(carry.cvt8());
+ } else if (shift == 32) {
+ code.bt(result, 31);
+ code.setc(carry.cvt8());
+ code.mov(result, 0);
+ } else {
+ code.xor_(result, result);
+ code.xor_(carry, carry);
+ }
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32();
+
+ code.mov(result, 63);
+ code.cmp(code.cl, 63);
+ code.cmovnb(code.ecx, result);
+ code.mov(result, operand);
+ code.bt(carry.cvt32(), 0);
+ code.shr(result.cvt64(), code.cl);
+ code.setc(carry.cvt8());
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ }
+}
+
+void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const u8 shift = shift_arg.GetImmediateU8();
+
+ if (shift < 64) {
+ code.shr(result, shift);
+ } else {
+ code.xor_(result.cvt32(), result.cvt32());
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
+ const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+
+ code.shrx(result, operand, shift);
+ code.xor_(zero.cvt32(), zero.cvt32());
+        code.cmp(shift.cvt8(), 64);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+
+        // The 64-bit x64 SHR instruction masks the shift count by 0x3F before performing the shift.
+        // ARM differs in behaviour: it does not mask the count, so shifts above 63 result in zeros.
+
+ code.shr(result, code.cl);
+ code.xor_(zero.cvt32(), zero.cvt32());
+ code.cmp(code.cl, 64);
+ code.cmovnb(result, zero);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+
+ code.sar(result, u8(shift < 31 ? shift : 31));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32();
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
+            // ARM differs in behaviour: it does not mask the count.
+
+ // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
+ code.mov(const31, 31);
+ code.cmp(shift.cvt8(), 31);
+ code.cmovnb(shift, const31);
+ code.sarx(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
+            // ARM differs in behaviour: it does not mask the count.
+
+ // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
+ code.mov(const31, 31);
+ code.cmp(code.cl, u32(31));
+ code.cmova(code.ecx, const31);
+ code.sar(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ } else {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8();
+
+ if (shift == 0) {
+ // There is nothing more to do.
+ } else if (shift <= 31) {
+ code.sar(result, shift);
+ code.setc(carry);
+ } else {
+ code.sar(result, 31);
+ code.bt(result, 31);
+ code.setc(carry);
+ }
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32();
+
+ code.mov(result, 63);
+ code.cmp(code.cl, 63);
+ code.cmovnb(code.ecx, result);
+ code.movsxd(result.cvt64(), operand);
+ code.bt(carry.cvt32(), 0);
+ code.sar(result.cvt64(), code.cl);
+ code.setc(carry.cvt8());
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ }
+}
+
+void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+
+ code.sar(result, u8(shift < 63 ? shift : 63));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
+ const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
+
+ code.mov(const63.cvt32(), 63);
+ code.cmp(shift.cvt8(), 63);
+ code.cmovnb(shift, const63);
+ code.sarx(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
+
+ // The 64-bit x64 SAR instruction masks the shift count by 0x3F before performing the shift.
+        // ARM differs in behaviour: it does not mask the count.
+
+ // We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
+ code.mov(const63, 63);
+ code.cmp(code.cl, u32(63));
+ code.cmovnb(code.ecx, const63);
+ code.sar(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+ auto& carry_arg = args[2];
+
+ if (!carry_inst) {
+ if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.rorx(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+
+ code.ror(result, u8(shift & 0x1F));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+
+ // x64 ROR instruction does (shift & 0x1F) for us.
+ code.ror(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ } else {
+ if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8();
+
+ if (shift == 0) {
+ // There is nothing more to do.
+ } else if ((shift & 0x1F) == 0) {
+ code.bt(result, u8(31));
+ code.setc(carry);
+ } else {
+ code.ror(result, shift);
+ code.setc(carry);
+ }
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8();
+
+ Xbyak::Label end;
+
+ code.test(code.cl, code.cl);
+ code.jz(end);
+
+ code.ror(result, code.cl);
+ code.bt(result, u8(31));
+ code.setc(carry);
+
+ code.L(end);
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ }
+}
+
+void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+
+ code.rorx(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else if (shift_arg.IsImmediate()) {
+ const u8 shift = shift_arg.GetImmediateU8();
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+
+ code.ror(result, u8(shift & 0x3F));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+
+ // x64 ROR instruction does (shift & 0x3F) for us.
+ code.ror(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(args[1]).cvt8();
+
+ code.bt(carry.cvt32(), 0);
+ code.rcr(result, 1);
+
+ if (carry_inst) {
+ code.setc(carry);
+
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename ShiftFT, typename BMI2FT>
+static void EmitMaskedShift32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, ShiftFT shift_fn, [[maybe_unused]] BMI2FT bmi2_shift) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+ const u32 shift = shift_arg.GetImmediateU32();
+
+ shift_fn(result, static_cast<int>(shift & 0x1F));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
+ if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+ const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
+
+ (code.*bmi2_shift)(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
+
+ shift_fn(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename ShiftFT, typename BMI2FT>
+static void EmitMaskedShift64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, ShiftFT shift_fn, [[maybe_unused]] BMI2FT bmi2_shift) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& operand_arg = args[0];
+ auto& shift_arg = args[1];
+
+ if (shift_arg.IsImmediate()) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+ const u64 shift = shift_arg.GetImmediateU64();
+
+ shift_fn(result, static_cast<int>(shift & 0x3F));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) {
+ if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+ const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
+
+ (code.*bmi2_shift)(result, operand, shift);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
+
+ shift_fn(result, code.cl);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitLogicalShiftLeftMasked32(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst, [&](auto result, auto shift) { code.shl(result, shift); }, &Xbyak::CodeGenerator::shlx);
+}
+
+void EmitX64::EmitLogicalShiftLeftMasked64(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst, [&](auto result, auto shift) { code.shl(result, shift); }, &Xbyak::CodeGenerator::shlx);
+}
+
+void EmitX64::EmitLogicalShiftRightMasked32(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst, [&](auto result, auto shift) { code.shr(result, shift); }, &Xbyak::CodeGenerator::shrx);
+}
+
+void EmitX64::EmitLogicalShiftRightMasked64(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst, [&](auto result, auto shift) { code.shr(result, shift); }, &Xbyak::CodeGenerator::shrx);
+}
+
+void EmitX64::EmitArithmeticShiftRightMasked32(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst, [&](auto result, auto shift) { code.sar(result, shift); }, &Xbyak::CodeGenerator::sarx);
+}
+
+void EmitX64::EmitArithmeticShiftRightMasked64(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst, [&](auto result, auto shift) { code.sar(result, shift); }, &Xbyak::CodeGenerator::sarx);
+}
+
+void EmitX64::EmitRotateRightMasked32(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift32(
+ code, ctx, inst, [&](auto result, auto shift) { code.ror(result, shift); }, nullptr);
+}
+
+void EmitX64::EmitRotateRightMasked64(EmitContext& ctx, IR::Inst* inst) {
+ EmitMaskedShift64(
+ code, ctx, inst, [&](auto result, auto shift) { code.ror(result, shift); }, nullptr);
+}
+
+static Xbyak::Reg8 DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* carry_out) {
+ if (carry_in.IsImmediate()) {
+ return carry_out ? reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};
+ } else {
+ return carry_out ? reg_alloc.UseScratchGpr(carry_in).cvt8() : reg_alloc.UseGpr(carry_in).cvt8();
+ }
+}
+
+static Xbyak::Reg64 DoNZCV(BlockOfCode& code, RegAlloc& reg_alloc, IR::Inst* nzcv_out) {
+ if (!nzcv_out) {
+ return Xbyak::Reg64{-1};
+ }
+
+ const Xbyak::Reg64 nzcv = reg_alloc.ScratchGpr(HostLoc::RAX);
+ code.xor_(nzcv.cvt32(), nzcv.cvt32());
+ return nzcv;
+}
+
+static void EmitAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+ const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& carry_in = args[2];
+
+    // Use LEA when no flags are requested and there is no carry-in.
+ if (!carry_inst && !overflow_inst && !nzcv_inst && carry_in.IsImmediate() && !carry_in.GetImmediateU1()) {
+ if (args[1].IsImmediate() && args[1].FitsInImmediateS32()) {
+ const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
+ const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize);
+
+ code.lea(result, code.ptr[op1 + args[1].GetImmediateS32()]);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
+ const Xbyak::Reg op2 = ctx.reg_alloc.UseGpr(args[1]).changeBit(bitsize);
+ const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize);
+
+ code.lea(result, code.ptr[op1 + op2]);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ return;
+ }
+
+ const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst);
+ const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bitsize);
+ const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst);
+ const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};
+
+ if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+ const u32 op_arg = args[1].GetImmediateU32();
+ if (carry_in.IsImmediate()) {
+ if (carry_in.GetImmediateU1()) {
+ code.stc();
+ code.adc(result, op_arg);
+ } else {
+ code.add(result, op_arg);
+ }
+ } else {
+ code.bt(carry.cvt32(), 0);
+ code.adc(result, op_arg);
+ }
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(bitsize);
+ if (carry_in.IsImmediate()) {
+ if (carry_in.GetImmediateU1()) {
+ code.stc();
+ code.adc(result, *op_arg);
+ } else {
+ code.add(result, *op_arg);
+ }
+ } else {
+ code.bt(carry.cvt32(), 0);
+ code.adc(result, *op_arg);
+ }
+ }
+
+ if (nzcv_inst) {
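+        // Host NZCV layout: LAHF packs SF/ZF/CF (and AF/PF) into AH, SETO records V in AL.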
+ code.lahf();
+ code.seto(code.al);
+ ctx.reg_alloc.DefineValue(nzcv_inst, nzcv);
+ }
+ if (carry_inst) {
+ code.setc(carry);
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ }
+ if (overflow_inst) {
+ code.seto(overflow);
+ ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitAdd(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitAdd(code, ctx, inst, 64);
+}
+
+static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) {
+ const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+ const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto& carry_in = args[2];
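+    // This is a pure comparison when the only users of the result are the flag pseudo-operations and the carry-in is set, so a CMP (which discards the result) suffices.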
+ const bool is_cmp = inst->UseCount() == size_t(!!carry_inst + !!overflow_inst + !!nzcv_inst) && carry_in.IsImmediate() && carry_in.GetImmediateU1();
+
+    // Use LEA for a plain subtract of a small immediate when no flags are requested.
+ if (!carry_inst && !overflow_inst && !nzcv_inst && carry_in.IsImmediate() && carry_in.GetImmediateU1() && args[1].IsImmediate() && args[1].FitsInImmediateS32() && args[1].GetImmediateS32() != 0xffff'ffff'8000'0000) {
+ const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize);
+ const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize);
+
+ code.lea(result, code.ptr[op1 - args[1].GetImmediateS32()]);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst);
+ const Xbyak::Reg result = (is_cmp ? ctx.reg_alloc.UseGpr(args[0]) : ctx.reg_alloc.UseScratchGpr(args[0])).changeBit(bitsize);
+ const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst);
+ const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};
+
+    // Note that for subtraction, the x64 carry flag (borrow) is the inverse of the ARM carry flag (not-borrow) here.
+
+ bool invert_output_carry = true;
+
+ if (is_cmp) {
+ if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+ const u32 op_arg = args[1].GetImmediateU32();
+ code.cmp(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(bitsize);
+ code.cmp(result, *op_arg);
+ }
+ } else if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+ const u32 op_arg = args[1].GetImmediateU32();
+ if (carry_in.IsImmediate()) {
+ if (carry_in.GetImmediateU1()) {
+ code.sub(result, op_arg);
+ } else {
+ code.add(result, ~op_arg);
+ invert_output_carry = false;
+ }
+ } else {
+ code.bt(carry.cvt32(), 0);
+ code.adc(result, ~op_arg);
+ invert_output_carry = false;
+ }
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(bitsize);
+ if (carry_in.IsImmediate()) {
+ if (carry_in.GetImmediateU1()) {
+ code.sub(result, *op_arg);
+ } else {
+ code.stc();
+ code.sbb(result, *op_arg);
+ }
+ } else {
+ code.bt(carry.cvt32(), 0);
+ code.cmc();
+ code.sbb(result, *op_arg);
+ }
+ }
+
+ if (nzcv_inst) {
+ if (invert_output_carry) {
+ code.cmc();
+ }
+ code.lahf();
+ code.seto(code.al);
+ ctx.reg_alloc.DefineValue(nzcv_inst, nzcv);
+ }
+ if (carry_inst) {
+ if (invert_output_carry) {
+ code.setnc(carry);
+ } else {
+ code.setc(carry);
+ }
+ ctx.reg_alloc.DefineValue(carry_inst, carry);
+ }
+ if (overflow_inst) {
+ code.seto(overflow);
+ ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+ }
+ if (!is_cmp) {
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitSub(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitSub(code, ctx, inst, 64);
+}
+
+void EmitX64::EmitMul32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ if (args[1].IsImmediate()) {
+ code.imul(result, result, args[1].GetImmediateU32());
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(32);
+
+ code.imul(result, *op_arg);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitMul64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+
+ code.imul(result, *op_arg);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitUnsignedMultiplyHigh64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
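+    // A 64-bit MUL produces a 128-bit product in RDX:RAX, so the first operand is
+    // placed in RAX and the high half of the product is taken from RDX.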
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ ctx.reg_alloc.UseScratch(args[0], HostLoc::RAX);
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ code.mul(*op_arg);
+
+ ctx.reg_alloc.DefineValue(inst, rdx);
+}
+
+void EmitX64::EmitSignedMultiplyHigh64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ ctx.reg_alloc.UseScratch(args[0], HostLoc::RAX);
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ code.imul(*op_arg);
+
+ ctx.reg_alloc.DefineValue(inst, rdx);
+}
+
+void EmitX64::EmitUnsignedDiv32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ const Xbyak::Reg32 dividend = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 divisor = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+
+ Xbyak::Label end;
+
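+    // ARM UDIV returns 0 on division by zero, so the zero-divisor case skips the
+    // x64 DIV (which would raise #DE) and keeps the pre-zeroed EAX as the result.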
+ code.xor_(eax, eax);
+ code.test(divisor, divisor);
+ code.jz(end);
+ code.mov(eax, dividend);
+ code.xor_(edx, edx);
+ code.div(divisor);
+ code.L(end);
+
+ ctx.reg_alloc.DefineValue(inst, eax);
+}
+
+void EmitX64::EmitUnsignedDiv64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ const Xbyak::Reg64 dividend = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 divisor = ctx.reg_alloc.UseGpr(args[1]);
+
+ Xbyak::Label end;
+
+ code.xor_(eax, eax);
+ code.test(divisor, divisor);
+ code.jz(end);
+ code.mov(rax, dividend);
+ code.xor_(edx, edx);
+ code.div(divisor);
+ code.L(end);
+
+ ctx.reg_alloc.DefineValue(inst, rax);
+}
+
+void EmitX64::EmitSignedDiv32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ const Xbyak::Reg32 dividend = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 divisor = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ Xbyak::Label end;
+
+ code.xor_(eax, eax);
+ code.test(divisor, divisor);
+ code.jz(end);
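+    // The division is widened to 64 bits so that INT32_MIN / -1 (which would raise
+    // #DE in a 32-bit IDIV) simply produces 0x80000000 in the low half, matching
+    // the ARM-defined wrap-around result.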
+ code.movsxd(rax, dividend);
+ code.movsxd(divisor.cvt64(), divisor);
+ code.cqo();
+ code.idiv(divisor.cvt64());
+ code.L(end);
+
+ ctx.reg_alloc.DefineValue(inst, eax);
+}
+
+void EmitX64::EmitSignedDiv64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ const Xbyak::Reg64 dividend = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 divisor = ctx.reg_alloc.UseGpr(args[1]);
+
+ Xbyak::Label end, ok;
+
+ code.xor_(eax, eax);
+ code.test(divisor, divisor);
+ code.jz(end);
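+    // 64-bit IDIV raises #DE on INT64_MIN / -1, so that case is filtered out here;
+    // the ARM-defined result is INT64_MIN, which is already in RAX from the
+    // comparison constant loaded below when the early exit is taken.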
+    code.cmp(divisor, 0xffffffff);  // the imm32 is sign-extended, so this compares against -1
+ code.jne(ok);
+ code.mov(rax, 0x8000000000000000);
+ code.cmp(dividend, rax);
+ code.je(end);
+ code.L(ok);
+ code.mov(rax, dividend);
+ code.cqo();
+ code.idiv(divisor);
+ code.L(end);
+
+ ctx.reg_alloc.DefineValue(inst, rax);
+}
+
+void EmitX64::EmitAnd32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ if (args[1].IsImmediate()) {
+ const u32 op_arg = args[1].GetImmediateU32();
+
+ code.and_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(32);
+
+ code.and_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAnd64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+
+ if (args[1].FitsInImmediateS32()) {
+ const u32 op_arg = u32(args[1].GetImmediateS32());
+
+ code.and_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(64);
+
+ code.and_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAndNot32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) {
+ Xbyak::Reg32 op_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ Xbyak::Reg32 op_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.andn(result, op_b, op_a);
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ Xbyak::Reg32 result;
+ if (args[1].IsImmediate()) {
+ result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, u32(~args[1].GetImmediateU32()));
+ } else {
+ result = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+ code.not_(result);
+ }
+
+ if (args[0].IsImmediate()) {
+ const u32 op_arg = args[0].GetImmediateU32();
+ code.and_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]);
+ op_arg.setBit(32);
+ code.and_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAndNot64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) {
+ Xbyak::Reg64 op_a = ctx.reg_alloc.UseGpr(args[0]);
+ Xbyak::Reg64 op_b = ctx.reg_alloc.UseGpr(args[1]);
+ Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ code.andn(result, op_b, op_a);
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ Xbyak::Reg64 result;
+ if (args[1].IsImmediate()) {
+ result = ctx.reg_alloc.ScratchGpr();
+ code.mov(result, ~args[1].GetImmediateU64());
+ } else {
+ result = ctx.reg_alloc.UseScratchGpr(args[1]);
+ code.not_(result);
+ }
+
+ if (args[0].FitsInImmediateS32()) {
+ const u32 op_arg = u32(args[0].GetImmediateS32());
+ code.and_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]);
+ op_arg.setBit(64);
+ code.and_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitEor32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ if (args[1].IsImmediate()) {
+ const u32 op_arg = args[1].GetImmediateU32();
+
+ code.xor_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(32);
+
+ code.xor_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitEor64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+
+ if (args[1].FitsInImmediateS32()) {
+ const u32 op_arg = u32(args[1].GetImmediateS32());
+
+ code.xor_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(64);
+
+ code.xor_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitOr32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+
+ if (args[1].IsImmediate()) {
+ const u32 op_arg = args[1].GetImmediateU32();
+
+ code.or_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(32);
+
+ code.or_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitOr64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+
+ if (args[1].FitsInImmediateS32()) {
+ const u32 op_arg = u32(args[1].GetImmediateS32());
+
+ code.or_(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(64);
+
+ code.or_(result, *op_arg);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitNot32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ Xbyak::Reg32 result;
+ if (args[0].IsImmediate()) {
+ result = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.mov(result, u32(~args[0].GetImmediateU32()));
+ } else {
+ result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ code.not_(result);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitNot64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ Xbyak::Reg64 result;
+ if (args[0].IsImmediate()) {
+ result = ctx.reg_alloc.ScratchGpr();
+ code.mov(result, ~args[0].GetImmediateU64());
+ } else {
+ result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.not_(result);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignExtendByteToWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movsx(result.cvt32(), result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movsx(result.cvt32(), result.cvt16());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignExtendByteToLong(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movsx(result.cvt64(), result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movsx(result.cvt64(), result.cvt16());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignExtendWordToLong(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movsxd(result.cvt64(), result.cvt32());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitZeroExtendByteToWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movzx(result.cvt32(), result.cvt8());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitZeroExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.movzx(result.cvt32(), result.cvt16());
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitZeroExtendByteToLong(EmitContext& ctx, IR::Inst* inst) {
+ // x64 zeros upper 32 bits on a 32-bit move
+ EmitZeroExtendByteToWord(ctx, inst);
+}
+
+void EmitX64::EmitZeroExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) {
+ // x64 zeros upper 32 bits on a 32-bit move
+ EmitZeroExtendHalfToWord(ctx, inst);
+}
+
+void EmitX64::EmitZeroExtendWordToLong(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.mov(result.cvt32(), result.cvt32()); // x64 zeros upper 32 bits on a 32-bit move
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitZeroExtendLongToQuad(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (args[0].IsInGpr()) {
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movq(result, source);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.movq(result, result);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitByteReverseWord(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ code.bswap(result);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitByteReverseHalf(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg16 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt16();
+ code.rol(result, 8);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.bswap(result);
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::LZCNT)) {
+ const Xbyak::Reg32 source = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.lzcnt(result, source);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Reg32 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // The result of a bsr of zero is undefined, but zf is set after it.
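+        // CLZ is computed as 31 - bsr(source); when source == 0 the cmovz loads -1,
+        // giving 31 - (-1) = 32 as required. The 64-bit variant below does the same
+        // with 63 and 64.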
+ code.bsr(result, source);
+ code.mov(source, 0xFFFFFFFF);
+ code.cmovz(result, source);
+ code.neg(result);
+ code.add(result, 31);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::LZCNT)) {
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]).cvt64();
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
+
+ code.lzcnt(result, source);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt64();
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
+
+ // The result of a bsr of zero is undefined, but zf is set after it.
+ code.bsr(result, source);
+ code.mov(source.cvt32(), 0xFFFFFFFF);
+ code.cmovz(result.cvt32(), source.cvt32());
+ code.neg(result.cvt32());
+ code.add(result.cvt32(), 63);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitMaxSigned32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ code.cmp(x, y);
+ code.cmovge(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMaxSigned64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
+
+ code.cmp(x, y);
+ code.cmovge(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMaxUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ code.cmp(x, y);
+ code.cmova(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMaxUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
+
+ code.cmp(x, y);
+ code.cmova(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMinSigned32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ code.cmp(x, y);
+ code.cmovle(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMinSigned64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
+
+ code.cmp(x, y);
+ code.cmovle(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMinUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ code.cmp(x, y);
+ code.cmovb(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitMinUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
+
+ code.cmp(x, y);
+ code.cmovb(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
new file mode 100644
index 0000000000..182c887538
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -0,0 +1,2134 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+#include <mcl/assert.hpp>
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <mcl/mp/typelist/cartesian_product.hpp>
+#include <mcl/mp/typelist/get.hpp>
+#include <mcl/mp/typelist/lift_sequence.hpp>
+#include <mcl/mp/typelist/list.hpp>
+#include <mcl/mp/typelist/lower_to_tuple.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/constants.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/cast_util.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/op.h"
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/common/lut_from_list.h"
+#include "dynarmic/interface/optimization_flags.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+namespace mp = mcl::mp;
+
+namespace {
+
+const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1);
+
+constexpr u64 f32_negative_zero = 0x80000000u;
+constexpr u64 f32_nan = 0x7fc00000u;
+constexpr u64 f32_non_sign_mask = 0x7fffffffu;
+constexpr u64 f32_smallest_normal = 0x00800000u;
+
+constexpr u64 f64_negative_zero = 0x8000000000000000u;
+constexpr u64 f64_nan = 0x7ff8000000000000u;
+constexpr u64 f64_non_sign_mask = 0x7fffffffffffffffu;
+constexpr u64 f64_smallest_normal = 0x0010000000000000u;
+
+constexpr u64 f64_min_s16 = 0xc0e0000000000000u; // -32768 as a double
+constexpr u64 f64_max_s16 = 0x40dfffc000000000u; // 32767 as a double
+constexpr u64 f64_min_u16 = 0x0000000000000000u; // 0 as a double
+constexpr u64 f64_max_u16 = 0x40efffe000000000u; // 65535 as a double
+constexpr u64 f64_max_s32 = 0x41dfffffffc00000u; // 2147483647 as a double
+constexpr u64 f64_max_u32 = 0x41efffffffe00000u; // 4294967295 as a double
+constexpr u64 f64_max_s64_lim = 0x43e0000000000000u;  // 2^63 as a double (the actual s64 maximum is not exactly representable)
+
+#define FCODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (fsize == 32) { \
+ code.NAME##s(args...); \
+ } else { \
+ code.NAME##d(args...); \
+ } \
+ }
+#define ICODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (fsize == 32) { \
+ code.NAME##d(args...); \
+ } else { \
+ code.NAME##q(args...); \
+ } \
+ }
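+// For example, with fsize == 32, FCODE(adds)(a, b) expands to code.addss(a, b) and
+// ICODE(padd)(a, b) to code.paddd(a, b); with fsize == 64 they become addsd and paddq respectively.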
+
+template<size_t fsize>
+void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> to_daz) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ constexpr u32 denormal_to_zero = FixupLUT(
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src);
+
+ const Xbyak::Xmm tmp = xmm16;
+ FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero));
+
+ for (const Xbyak::Xmm& xmm : to_daz) {
+ FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
+ }
+ return;
+ }
+
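+    // SSE fallback: build an all-ones mask for |x| >= smallest_normal (NaNs and
+    // infinities included), OR the sign bit into the mask, then AND it with the
+    // value; denormals and zeros therefore collapse to a correctly signed zero
+    // while every other value is left untouched.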
+ for (const Xbyak::Xmm& xmm : to_daz) {
+ code.movaps(xmm0, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
+ code.andps(xmm0, xmm);
+ if constexpr (fsize == 32) {
+ code.pcmpgtd(xmm0, code.Const(xword, f32_smallest_normal - 1));
+ } else if (code.HasHostFeature(HostFeature::SSE42)) {
+ code.pcmpgtq(xmm0, code.Const(xword, f64_smallest_normal - 1));
+ } else {
+ code.pcmpgtd(xmm0, code.Const(xword, f64_smallest_normal - 1));
+ code.pshufd(xmm0, xmm0, 0b11100101);
+ }
+ code.orps(xmm0, code.Const(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero));
+ code.andps(xmm, xmm0);
+ }
+}
+
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
+ if (ctx.FPCR().FZ()) {
+ ForceDenormalsToZero<fsize>(code, to_daz);
+ }
+}
+
+template<size_t fsize>
+void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
+ FpFixup::PosZero);
+ FCODE(vfixupimms)(xmm_value, xmm_value, code.Const(ptr, u64(nan_to_zero)), u8(0));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpords)(xmm_scratch, xmm_value, xmm_value);
+ FCODE(vandp)(xmm_value, xmm_value, xmm_scratch);
+ } else {
+ code.xorps(xmm_scratch, xmm_scratch);
+        FCODE(cmpords)(xmm_scratch, xmm_value);  // true mask when ordered (i.e. when not a NaN)
+ code.pand(xmm_value, xmm_scratch);
+ }
+}
+
+template<size_t fsize>
+void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ const Xbyak::Opmask nan_mask = k1;
+ FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
+ FCODE(vblendmp)(result | nan_mask, result, code.Const(ptr_b, fsize == 32 ? f32_nan : f64_nan));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpunords)(xmm0, result, result);
+ FCODE(blendvp)(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
+ } else {
+ Xbyak::Label end;
+ FCODE(ucomis)(result, result);
+ code.jnp(end);
+ code.movaps(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
+ code.L(end);
+ }
+}
+
+template<size_t fsize>
+SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) {
+ SharedLabel nan = GenSharedLabel(), end = GenSharedLabel();
+
+ FCODE(ucomis)(a, a);
+ code.jp(*nan, code.T_NEAR);
+
+ ctx.deferred_emits.emplace_back([=, &code] {
+ code.L(*nan);
+ code.orps(a, code.Const(xword, fsize == 32 ? 0x00400000 : 0x0008'0000'0000'0000));
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ return end;
+}
+
+template<size_t fsize>
+void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) {
+ code.movaps(tmp, result);
+ FCODE(cmpunordp)(tmp, tmp);
+ ICODE(psll)(tmp, int(fsize - 1));
+ code.xorps(result, tmp);
+}
+
+// This is necessary because x86 and ARM differ in the way they return NaNs from floating-point operations
+//
+// ARM behaviour:
+// op1 op2 result
+// SNaN SNaN/QNaN op1
+// QNaN SNaN op2
+// QNaN QNaN op1
+// SNaN/QNaN other op1
+// other SNaN/QNaN op2
+//
+// x86 behaviour:
+// op1 op2 result
+// SNaN/QNaN SNaN/QNaN op1
+// SNaN/QNaN other op1
+// other SNaN/QNaN op2
+//
+// With ARM: SNaNs take priority. With x86: it doesn't matter.
+//
+// From the above we can see what differs between the architectures is
+// the case when op1 == QNaN and op2 == SNaN.
+//
+// We assume that registers op1 and op2 are read-only. This function also trashes xmm0.
+// We allow for the case where op1 and result are the same register. We do not read from op1 once result is written to.
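+//
+// Illustrative example (f32): op1 = 0x7FC00000 (QNaN), op2 = 0x7FA00000 (SNaN).
+// ARM quietens and returns op2, giving 0x7FE00000, whereas x86 would simply return
+// op1; the code below detects exactly this case and quietens op2 itself.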
+template<size_t fsize>
+void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, Xbyak::Xmm op2, Xbyak::Reg64 tmp, Xbyak::Label end) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask;
+ constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb;
+ constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+
+ // At this point we know that at least one of op1 and op2 is a NaN.
+ // Thus in op1 ^ op2 at least one of the two would have all 1 bits in the exponent.
+ // Keeping in mind xor is commutative, there are only four cases:
+ // SNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 0
+ // QNaN ^ QNaN -> exponent == 0, mantissa_msb == 0
+ // QNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 1
+ // SNaN/QNaN ^ Otherwise -> exponent != 0, mantissa_msb == ?
+ //
+ // We're only really interested in op1 == QNaN and op2 == SNaN,
+ // so we filter out everything else.
+ //
+ // We do it this way instead of checking that op1 is QNaN because
+ // op1 == QNaN && op2 == QNaN is the most common case. With this method
+ // that case would only require one branch.
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vxorps(xmm0, op1, op2);
+ } else {
+ code.movaps(xmm0, op1);
+ code.xorps(xmm0, op2);
+ }
+
+ constexpr size_t shift = fsize == 32 ? 0 : 48;
+ if constexpr (fsize == 32) {
+ code.movd(tmp.cvt32(), xmm0);
+ } else {
+ // We do this to avoid requiring 64-bit immediates
+ code.pextrw(tmp.cvt32(), xmm0, shift / 16);
+ }
+ code.and_(tmp.cvt32(), static_cast<u32>((exponent_mask | mantissa_msb) >> shift));
+ code.cmp(tmp.cvt32(), static_cast<u32>(mantissa_msb >> shift));
+ code.jne(end, code.T_NEAR);
+
+ // If we're here there are four cases left:
+ // op1 == SNaN && op2 == QNaN
+ // op1 == Inf && op2 == QNaN
+ // op1 == QNaN && op2 == SNaN <<< The problematic case
+ // op1 == QNaN && op2 == Inf
+
+ if constexpr (fsize == 32) {
+ code.movd(tmp.cvt32(), op2);
+ code.shl(tmp.cvt32(), 32 - mantissa_msb_bit);
+ } else {
+ code.movq(tmp, op2);
+ code.shl(tmp, 64 - mantissa_msb_bit);
+ }
+ // If op2 is a SNaN, CF = 0 and ZF = 0.
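+    // (The shift moves op2's quiet bit into CF and the remaining mantissa bits
+    // decide ZF: a QNaN sets CF, an infinity sets ZF, so only a signalling NaN
+    // falls through to the silencing code below.)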
+ code.jna(end, code.T_NEAR);
+
+ // Silence the SNaN as required by spec.
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vorps(result, op2, code.Const(xword, mantissa_msb));
+ } else {
+ code.movaps(result, op2);
+ code.orps(result, code.Const(xword, mantissa_msb));
+ }
+ code.jmp(end, code.T_NEAR);
+}
+
+template<size_t fsize, typename Function>
+void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ SharedLabel end = GenSharedLabel();
+
+ Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (!ctx.FPCR().DN() && !ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ end = ProcessNaN<fsize>(code, ctx, result);
+ }
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ (code.*fn)(result, result);
+ } else {
+ fn(result);
+ }
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ // Do nothing
+ } else if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<fsize>(code, result);
+ } else {
+ PostProcessNaN<fsize>(code, result, xmm0);
+ }
+ code.L(*end);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<size_t fsize, typename Function>
+void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (ctx.FPCR().DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ (code.*fn)(result, operand);
+ } else {
+ fn(result, operand);
+ }
+
+ if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ ForceToDefaultNaN<fsize>(code, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+ SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+
+ code.movaps(result, op1);
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ (code.*fn)(result, op2);
+ } else {
+ fn(result, op2);
+ }
+ FCODE(ucomis)(result, result);
+ code.jp(*nan, code.T_NEAR);
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code] {
+ Xbyak::Label op_are_nans;
+
+ code.L(*nan);
+ FCODE(ucomis)(op1, op2);
+ code.jp(op_are_nans);
+ // Here we must return a positive NaN, because the indefinite value on x86 is a negative NaN!
+ code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
+ code.jmp(*end, code.T_NEAR);
+ code.L(op_are_nans);
+ EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+} // anonymous namespace
+
+template<size_t fsize>
+void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Address mask = code.Const(xword, non_sign_mask);
+
+ code.andps(result, mask);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) {
+ FPAbs<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) {
+ FPAbs<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) {
+ FPAbs<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Address mask = code.Const(xword, u64(sign_mask));
+
+ code.xorps(result, mask);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) {
+ FPNeg<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) {
+ FPNeg<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) {
+ FPNeg<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::addss);
+}
+
+void EmitX64::EmitFPAdd64(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::addsd);
+}
+
+void EmitX64::EmitFPDiv32(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::divss);
+}
+
+void EmitX64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::divsd);
+}
+
+template<size_t fsize, bool is_max>
+static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr();
+
+ DenormalsAreZero<fsize>(code, ctx, {result, operand});
+
+ SharedLabel equal = GenSharedLabel(), end = GenSharedLabel();
+
+ FCODE(ucomis)(result, operand);
+ code.jz(*equal, code.T_NEAR);
+ if constexpr (is_max) {
+ FCODE(maxs)(result, operand);
+ } else {
+ FCODE(mins)(result, operand);
+ }
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ Xbyak::Label nan;
+
+ code.L(*equal);
+ code.jp(nan);
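+        // The operands compared equal and neither is a NaN: the only case where
+        // they can still differ is +0 vs -0, where ARM FMax returns +0 and FMin
+        // returns -0. ANDing the bit patterns yields +0 and ORing yields -0; for
+        // genuinely identical values both operations are no-ops.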
+ if constexpr (is_max) {
+ code.andps(result, operand);
+ } else {
+ code.orps(result, operand);
+ }
+ code.jmp(*end);
+
+ code.L(nan);
+ if (ctx.FPCR().DN()) {
+ code.movaps(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan));
+ code.jmp(*end);
+ } else {
+ code.movaps(tmp, result);
+ FCODE(adds)(result, operand);
+ EmitPostProcessNaNs<fsize>(code, result, tmp, operand, gpr_scratch, *end);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<size_t fsize, bool is_max>
+static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT default_nan = FP::FPInfo<FPT>::DefaultNaN();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm op1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm op2 = ctx.reg_alloc.UseScratchXmm(args[1]); // Result stored here!
+
+ DenormalsAreZero<fsize>(code, ctx, {op1, op2});
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+        // vranges{s,d} already handles comparing signed zeros and propagating
+        // NaNs similarly to ARM
+ constexpr FpRangeSelect range_select = is_max ? FpRangeSelect::Max : FpRangeSelect::Min;
+ FCODE(vranges)(op2, op1, op2, FpRangeLUT(range_select, FpRangeSign::Preserve));
+
+ if (ctx.FPCR().DN()) {
+ FCODE(vcmps)(k1, op2, op2, Cmp::Unordered_Q);
+ FCODE(vmovs)(op2 | k1, code.Const(xword, default_nan));
+ }
+ } else {
+ Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr();
+ tmp.setBit(fsize);
+
+ const auto move_to_tmp = [=, &code](const Xbyak::Xmm& xmm) {
+ if constexpr (fsize == 32) {
+ code.movd(tmp.cvt32(), xmm);
+ } else {
+ code.movq(tmp.cvt64(), xmm);
+ }
+ };
+
+ SharedLabel end = GenSharedLabel(), z = GenSharedLabel();
+
+ FCODE(ucomis)(op1, op2);
+ code.jz(*z, code.T_NEAR);
+ if constexpr (is_max) {
+ FCODE(maxs)(op2, op1);
+ } else {
+ FCODE(mins)(op2, op1);
+ }
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ Xbyak::Label nan, op2_is_nan, snan, maybe_both_nan;
+
+ constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+
+ code.L(*z);
+ code.jp(nan);
+ if constexpr (is_max) {
+ code.andps(op2, op1);
+ } else {
+ code.orps(op2, op1);
+ }
+ code.jmp(*end);
+
+ // NaN requirements:
+ // op1 op2 result
+ // SNaN anything op1
+ // !SNaN SNaN op2
+ // QNaN !NaN op2
+ // !NaN QNaN op1
+ // QNaN QNaN op1
+
+ code.L(nan);
+ FCODE(ucomis)(op1, op1);
+ code.jnp(op2_is_nan);
+
+ // op1 is NaN
+ move_to_tmp(op1);
+ code.bt(tmp, mantissa_msb_bit);
+ code.jc(maybe_both_nan);
+ if (ctx.FPCR().DN()) {
+ code.L(snan);
+ code.movaps(op2, code.Const(xword, default_nan));
+ code.jmp(*end);
+ } else {
+ code.movaps(op2, op1);
+ code.L(snan);
+ code.orps(op2, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb));
+ code.jmp(*end);
+ }
+
+ code.L(maybe_both_nan);
+ FCODE(ucomis)(op2, op2);
+ code.jnp(*end, code.T_NEAR);
+ if (ctx.FPCR().DN()) {
+ code.jmp(snan);
+ } else {
+ move_to_tmp(op2);
+ code.bt(tmp.cvt64(), mantissa_msb_bit);
+ code.jnc(snan);
+ code.movaps(op2, op1);
+ code.jmp(*end);
+ }
+
+ // op2 is NaN
+ code.L(op2_is_nan);
+ move_to_tmp(op2);
+ code.bt(tmp, mantissa_msb_bit);
+ code.jnc(snan);
+ code.movaps(op2, op1);
+ code.jmp(*end);
+ });
+ }
+
+ ctx.reg_alloc.DefineValue(inst, op2);
+}
+
+void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMax<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMax<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMaxNumeric<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMaxNumeric<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMax<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMax<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMaxNumeric<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMinMaxNumeric<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::mulss);
+}
+
+void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::mulsd);
+}
+
+template<size_t fsize, bool negate_product>
+static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ const auto fallback_fn = negate_product ? &FP::FPMulSub<FPT> : &FP::FPMulAdd<FPT>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if constexpr (fsize != 16) {
+ const bool needs_rounding_correction = ctx.FPCR().FZ();
+ const bool needs_nan_correction = !ctx.FPCR().DN();
+
+ if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+
+ if constexpr (negate_product) {
+ FCODE(vfnmadd231s)(result, operand2, operand3);
+ } else {
+ FCODE(vfmadd231s)(result, operand2, operand3);
+ }
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<fsize>(code, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
+ SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, operand1);
+ if constexpr (negate_product) {
+ FCODE(vfnmadd231s)(result, operand2, operand3);
+ } else {
+ FCODE(vfmadd231s)(result, operand2, operand3);
+ }
+
+ if (needs_rounding_correction && needs_nan_correction) {
+ code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
+ FCODE(ucomis)(xmm0, code.Const(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
+ code.jz(*fallback, code.T_NEAR);
+ } else if (needs_rounding_correction) {
+ code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
+ code.vxorps(xmm0, xmm0, code.Const(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal));
+ code.ptest(xmm0, xmm0);
+ code.jz(*fallback, code.T_NEAR);
+ } else if (needs_nan_correction) {
+ FCODE(ucomis)(result, result);
+ code.jp(*fallback, code.T_NEAR);
+ } else {
+ UNREACHABLE();
+ }
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<fsize>(code, result);
+ }
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+
+ Xbyak::Label nan;
+
+ if (needs_rounding_correction && needs_nan_correction) {
+ code.jp(nan, code.T_NEAR);
+ }
+
+ if (needs_rounding_correction) {
+ // x64 rounds before flushing to zero
+ // AArch64 rounds after flushing to zero
+                    // This difference in behaviour is noticeable if something would round to the smallest normalized number
+
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.movq(code.ABI_PARAM1, operand1);
+ code.movq(code.ABI_PARAM2, operand2);
+ code.movq(code.ABI_PARAM3, operand3);
+ code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
+#ifdef _WIN32
+ code.sub(rsp, 16 + ABI_SHADOW_SPACE);
+ code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
+ code.CallFunction(fallback_fn);
+ code.add(rsp, 16 + ABI_SHADOW_SPACE);
+#else
+ code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(fallback_fn);
+#endif
+ code.movq(result, code.ABI_RETURN);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end);
+ }
+
+ if (needs_nan_correction) {
+ code.L(nan);
+
+ // AArch64 preferentially returns the first SNaN over the first QNaN
+                    // x64's vfmadd231s{s,d} returns the first of {op2, op3, op1} that is a NaN, regardless of signalling state
+
+ Xbyak::Label has_nan, indeterminate, op1_snan, op1_done, op2_done, op3_done;
+
+ code.vmovaps(xmm0, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb));
+
+ FCODE(ucomis)(operand2, operand3);
+ code.jp(has_nan);
+ FCODE(ucomis)(operand1, operand1);
+ code.jnp(indeterminate);
+
+ // AArch64 specifically emits a default NaN for the case when the addend is a QNaN and the two other arguments are {inf, zero}
+ code.ptest(operand1, xmm0);
+ code.jz(op1_snan);
+ FCODE(vmuls)(xmm0, operand2, operand3); // check if {op2, op3} are {inf, zero}/{zero, inf}
+ FCODE(ucomis)(xmm0, xmm0);
+ code.jnp(*end);
+
+ code.L(indeterminate);
+ code.vmovaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
+ code.jmp(*end);
+
+ code.L(has_nan);
+
+ FCODE(ucomis)(operand1, operand1);
+ code.jnp(op1_done);
+ code.movaps(result, operand1); // this is done because of NaN behavior of vfmadd231s (priority of op2, op3, op1)
+ code.ptest(operand1, xmm0);
+ code.jnz(op1_done);
+ code.L(op1_snan);
+ code.vorps(result, operand1, xmm0);
+ code.jmp(*end);
+ code.L(op1_done);
+
+ FCODE(ucomis)(operand2, operand2);
+ code.jnp(op2_done);
+ code.ptest(operand2, xmm0);
+ code.jnz(op2_done);
+ code.vorps(result, operand2, xmm0);
+ if constexpr (negate_product) {
+ code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+ }
+ code.jmp(*end);
+ code.L(op2_done);
+
+ FCODE(ucomis)(operand3, operand3);
+ code.jnp(op3_done);
+ code.ptest(operand3, xmm0);
+ code.jnz(op3_done);
+ code.vorps(result, operand3, xmm0);
+ code.jmp(*end);
+ code.L(op3_done);
+
+ // at this point, all SNaNs have been handled
+ // if op1 was not a QNaN and op2 is, negate the result
+ if constexpr (negate_product) {
+ FCODE(ucomis)(operand1, operand1);
+ code.jp(*end);
+ FCODE(ucomis)(operand2, operand2);
+ code.jnp(*end);
+ code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+ }
+
+ code.jmp(*end);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+
+ if constexpr (negate_product) {
+ code.xorps(operand2, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+ }
+ FCODE(muls)(operand2, operand3);
+ FCODE(adds)(operand1, operand2);
+
+ ctx.reg_alloc.DefineValue(inst, operand1);
+ return;
+ }
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0], args[1], args[2]);
+ code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value());
+#ifdef _WIN32
+ ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
+ code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE], rax);
+ code.CallFunction(fallback_fn);
+ ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
+#else
+ code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(fallback_fn);
+#endif
+}
+
+void EmitX64::EmitFPMulAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<16, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<16, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulAdd<64, true>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const bool do_default_nan = ctx.FPCR().DN();
+
+ const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr();
+
+ SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vmuls)(result, op1, op2);
+ } else {
+ code.movaps(result, op1);
+ FCODE(muls)(result, op2);
+ }
+ FCODE(ucomis)(result, result);
+ code.jp(*nan, code.T_NEAR);
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code] {
+ Xbyak::Label op_are_nans;
+
+ code.L(*nan);
+ FCODE(ucomis)(op1, op2);
+ code.jp(op_are_nans);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vxorps(result, op1, op2);
+ } else {
+ code.movaps(result, op1);
+ code.xorps(result, op2);
+ }
+ code.andps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask));
+ code.orps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
+ code.jmp(*end, code.T_NEAR);
+ code.L(op_are_nans);
+ if (do_default_nan) {
+ code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN()));
+ code.jmp(*end, code.T_NEAR);
+ } else {
+ EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPMulX32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulX<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPMulX<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ if constexpr (fsize != 16) {
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ FCODE(vrcp14s)(result, operand, operand);
+ } else {
+ if constexpr (fsize == 32) {
+ code.rcpss(result, operand);
+ } else {
+ code.cvtsd2ss(result, operand);
+ code.rcpss(result, result);
+ code.cvtss2sd(result, result);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRecipEstimate<FPT>);
+}
+
+void EmitX64::EmitFPRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipEstimate<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipEstimate<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipEstimate<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRecipExponent<FPT>);
+}
+
+void EmitX64::EmitFPRecipExponent16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipExponent<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipExponent32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipExponent<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipExponent64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipExponent<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::FMA) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ Xbyak::Label end, fallback;
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA)) {
+ SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+ FCODE(ucomis)(result, result);
+ code.jp(*fallback, code.T_NEAR);
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.movq(code.ABI_PARAM1, operand1);
+ code.movq(code.ABI_PARAM2, operand2);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRecipStepFused<FPT>);
+ code.movq(result, code.ABI_RETURN);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>()));
+ FCODE(muls)(operand1, operand2);
+ FCODE(subs)(result, operand1);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0], args[1]);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRecipStepFused<FPT>);
+}
+
+void EmitX64::EmitFPRecipStepFused16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipStepFused<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipStepFused<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRecipStepFused<64>(code, ctx, inst);
+}
+
+static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+ const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
+
+ if (fsize != 16 && code.HasHostFeature(HostFeature::SSE41) && round_imm && !exact) {
+ if (fsize == 64) {
+ FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) {
+ code.roundsd(result, result, *round_imm);
+ });
+ } else {
+ FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) {
+ code.roundss(result, result, *round_imm);
+ });
+ }
+
+ return;
+ }
+
+ using fsize_list = mp::list<mp::lift_value<size_t(16)>,
+ mp::lift_value<size_t(32)>,
+ mp::lift_value<size_t(64)>>;
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+ using exact_list = mp::list<std::true_type, std::false_type>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) {
+ constexpr size_t fsize = mp::get<0, I>::value;
+ constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
+ constexpr bool exact = mp::get<2, I>::value;
+ using InputSize = mcl::unsigned_integer_of_size<fsize>;
+
+ return FP::FPRoundInt<InputSize>(static_cast<InputSize>(input), fpcr, rounding_mode, exact, fpsr);
+ })};
+ },
+ mp::cartesian_product<fsize_list, rounding_list, exact_list>{});
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.CallFunction(lut.at(std::make_tuple(fsize, rounding_mode, exact)));
+}
+
+void EmitX64::EmitFPRoundInt16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRound(code, ctx, inst, 16);
+}
+
+void EmitX64::EmitFPRoundInt32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRound(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitFPRoundInt64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRound(code, ctx, inst, 64);
+}
+
+template<size_t fsize>
+static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ if constexpr (fsize != 16) {
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ FCODE(vrsqrt14s)(result, operand, operand);
+ } else {
+ if constexpr (fsize == 32) {
+ code.rsqrtss(result, operand);
+ } else {
+ code.cvtsd2ss(result, operand);
+ code.rsqrtss(result, result);
+ code.cvtss2sd(result, result);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm();
+ [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+
+ code.movaps(value, operand);
+
+ code.movaps(xmm0, code.Const(xword, fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000));
+ code.pand(value, xmm0);
+ code.por(value, code.Const(xword, fsize == 32 ? 0x00008000 : 0x0000'1000'0000'0000));
+
+ // Detect NaNs, negatives, zeros, denormals and infinities
+ FCODE(ucomis)(value, code.Const(xword, FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width));
+ code.jna(*bad_values, code.T_NEAR);
+
+ FCODE(sqrts)(value, value);
+ ICODE(mov)(result, code.Const(xword, FP::FPValue<FPT, false, 0, 1>()));
+ FCODE(divs)(result, value);
+
+ ICODE(padd)(result, code.Const(xword, fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000));
+ code.pand(result, xmm0);
+
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ Xbyak::Label fallback, default_nan;
+ bool needs_fallback = false;
+
+ code.L(*bad_values);
+ if constexpr (fsize == 32) {
+ code.movd(tmp, operand);
+
+ if (!ctx.FPCR().FZ()) {
+ if (ctx.FPCR().DN()) {
+ // a > 0x80000000
+ code.cmp(tmp, 0x80000000);
+ code.ja(default_nan, code.T_NEAR);
+ }
+
+ // a > 0 && a < 0x00800000;
+ code.sub(tmp, 1);
+ code.cmp(tmp, 0x007FFFFF);
+ code.jb(fallback);
+ needs_fallback = true;
+ }
+
+ code.rsqrtss(result, operand);
+
+ if (ctx.FPCR().DN()) {
+ code.ucomiss(result, result);
+ code.jnp(*end, code.T_NEAR);
+ } else {
+ // FZ ? (a >= 0x80800000 && a <= 0xFF800000) : (a >= 0x80000001 && a <= 0xFF800000)
+ // !FZ path takes into account the subtraction by one from the earlier block
+ code.add(tmp, ctx.FPCR().FZ() ? 0x7F800000 : 0x80000000);
+ code.cmp(tmp, ctx.FPCR().FZ() ? 0x7F000001 : 0x7F800000);
+ code.jnb(*end, code.T_NEAR);
+ }
+
+ code.L(default_nan);
+ code.movd(result, code.Const(xword, 0x7FC00000));
+ code.jmp(*end, code.T_NEAR);
+ } else {
+ Xbyak::Label nan, zero;
+
+ code.movaps(value, operand);
+ DenormalsAreZero<fsize>(code, ctx, {value});
+ code.pxor(result, result);
+
+ code.ucomisd(value, result);
+ if (ctx.FPCR().DN()) {
+ code.jc(default_nan);
+ code.je(zero);
+ } else {
+ code.jp(nan);
+ code.je(zero);
+ code.jc(default_nan);
+ }
+
+ if (!ctx.FPCR().FZ()) {
+ needs_fallback = true;
+ code.jmp(fallback);
+ } else {
+ // result = 0
+ code.jmp(*end, code.T_NEAR);
+ }
+
+ code.L(zero);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpor(result, value, code.Const(xword, 0x7FF0'0000'0000'0000));
+ } else {
+ code.movaps(result, value);
+ code.por(result, code.Const(xword, 0x7FF0'0000'0000'0000));
+ }
+ code.jmp(*end, code.T_NEAR);
+
+ code.L(nan);
+ if (!ctx.FPCR().DN()) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpor(result, operand, code.Const(xword, 0x0008'0000'0000'0000));
+ } else {
+ code.movaps(result, operand);
+ code.por(result, code.Const(xword, 0x0008'0000'0000'0000));
+ }
+ code.jmp(*end, code.T_NEAR);
+ }
+
+ code.L(default_nan);
+ code.movq(result, code.Const(xword, 0x7FF8'0000'0000'0000));
+ code.jmp(*end, code.T_NEAR);
+ }
+
+ code.L(fallback);
+ if (needs_fallback) {
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.movq(code.ABI_PARAM1, operand);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRSqrtEstimate<FPT>);
+ code.movq(result, rax);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRSqrtEstimate<FPT>);
+ }
+}
+
+void EmitX64::EmitFPRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtEstimate<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtEstimate<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtEstimate<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+ FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
+ SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+
+ // Detect if the intermediate result is infinity or NaN or nearly an infinity.
+ // Why do we need to care about infinities? This is because x86 doesn't allow us
+ // to fuse the divide-by-two with the rest of the FMA operation. Therefore the
+ // intermediate value may overflow and we would like to handle this case.
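+            // Extract the top 16 bits of the scalar and isolate the exponent field; take the
+            // fallback if the exponent is at or one below its maximum (huge, infinite or NaN).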
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.vpextrw(tmp, result, fsize == 32 ? 1 : 3);
+ code.and_(tmp.cvt16(), fsize == 32 ? 0x7f80 : 0x7ff0);
+ code.cmp(tmp.cvt16(), fsize == 32 ? 0x7f00 : 0x7fe0);
+ ctx.reg_alloc.Release(tmp);
+
+ code.jae(*fallback, code.T_NEAR);
+
+ FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.movq(code.ABI_PARAM1, operand1);
+ code.movq(code.ABI_PARAM2, operand2);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
+ code.movq(result, code.ABI_RETURN);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
+ FCODE(muls)(operand1, operand2);
+ FCODE(subs)(result, operand1);
+ FCODE(muls)(result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
+
+            ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0], args[1]);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
+}
+
+void EmitX64::EmitFPRSqrtStepFused16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtStepFused<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtStepFused32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtStepFused<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtStepFused64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPRSqrtStepFused<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) {
+ FPTwoOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::sqrtss);
+}
+
+void EmitX64::EmitFPSqrt64(EmitContext& ctx, IR::Inst* inst) {
+ FPTwoOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::sqrtsd);
+}
+
+void EmitX64::EmitFPSub32(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::subss);
+}
+
+void EmitX64::EmitFPSub64(EmitContext& ctx, IR::Inst* inst) {
+ FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::subsd);
+}
+
+static Xbyak::Reg64 SetFpscrNzcvFromFlags(BlockOfCode& code, EmitContext& ctx) {
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX); // shifting requires use of cl
+ const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr();
+
+ // x64 flags ARM flags
+ // ZF PF CF NZCV
+ // Unordered 1 1 1 0011
+ // Greater than 0 0 0 0010
+ // Less than 0 0 1 1000
+ // Equal 1 0 0 0110
+ //
+    // Thus we can use ZF:CF as an index into an array like so:
+ // x64 ARM ARM as x64
+ // ZF:CF NZCV NZ-----C-------V
+ // 0 0010 0000000100000000 = 0x0100
+ // 1 1000 1000000000000000 = 0x8000
+ // 2 0110 0100000100000000 = 0x4100
+ // 3 0011 0000000100000001 = 0x0101
+
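+    // The sete/rcl sequence below leaves cl = ZF:CF:0000, i.e. 16 * (ZF:CF), so the shift selects
+    // one 16-bit entry of the packed constant (e.g. Equal: ZF=1, CF=0 -> cl=32 -> 0x4100).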
+ code.mov(nzcv, 0x0101'4100'8000'0100);
+ code.sete(cl);
+ code.rcl(cl, 5); // cl = ZF:CF:0000
+ code.shr(nzcv, cl);
+
+ return nzcv;
+}
+
+void EmitX64::EmitFPCompare32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm reg_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm reg_b = ctx.reg_alloc.UseXmm(args[1]);
+ const bool exc_on_qnan = args[2].GetImmediateU1();
+
+ if (exc_on_qnan) {
+ code.comiss(reg_a, reg_b);
+ } else {
+ code.ucomiss(reg_a, reg_b);
+ }
+
+ const Xbyak::Reg64 nzcv = SetFpscrNzcvFromFlags(code, ctx);
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+}
+
+void EmitX64::EmitFPCompare64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm reg_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm reg_b = ctx.reg_alloc.UseXmm(args[1]);
+ const bool exc_on_qnan = args[2].GetImmediateU1();
+
+ if (exc_on_qnan) {
+ code.comisd(reg_a, reg_b);
+ } else {
+ code.ucomisd(reg_a, reg_b);
+ }
+
+ const Xbyak::Reg64 nzcv = SetFpscrNzcvFromFlags(code, ctx);
+ ctx.reg_alloc.DefineValue(inst, nzcv);
+}
+
+void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+
+ if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
+
+ // Double-conversion here is acceptable as this is expanding precision.
+ code.vcvtph2ps(result, value);
+ code.vcvtps2pd(result, result);
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<64>(code, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u64, u16>);
+}
+
+void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+
+ if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
+
+ code.vcvtph2ps(result, value);
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<32>(code, result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u32, u16>);
+}
+
+void EmitX64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+
+ // We special-case the non-IEEE-defined ToOdd rounding mode.
+ if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.cvtss2sd(result, result);
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<64>(code, result);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u64, u32>);
+ }
+}
+
+void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+ const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
+
+ if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<32>(code, result);
+ }
+ code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u16, u32>);
+}
+
+void EmitX64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+
+ // NOTE: Do not double-convert here as that is inaccurate.
+ // To be accurate, the first conversion would need to be "round-to-odd", which x64 doesn't support.
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u16, u64>);
+}
+
+void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8());
+
+ // We special-case the non-IEEE-defined ToOdd rounding mode.
+ if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.cvtsd2ss(result, result);
+ if (ctx.FPCR().DN()) {
+ ForceToDefaultNaN<32>(code, result);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
+ code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode));
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.CallFunction(&FP::FPConvert<u32, u64>);
+ }
+}
+
+template<size_t fsize, bool unsigned_, size_t isize>
+static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const size_t fbits = args[1].GetImmediateU8();
+ const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+
+ if constexpr (fsize != 16) {
+ const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
+
+        // cvttsd2si already truncates, so when rounding towards zero no explicit rounding (and thus no SSE4.1) is required
+ const bool truncating = rounding_mode == FP::RoundingMode::TowardsZero;
+
+ if (round_imm && (truncating || code.HasHostFeature(HostFeature::SSE41))) {
+ const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64();
+
+ if constexpr (fsize == 64) {
+ if (fbits != 0) {
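+                    // 2^fbits encoded directly as an f64 bit pattern: exponent = fbits + 1023, mantissa = 0.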
+ const u64 scale_factor = static_cast<u64>((fbits + 1023) << 52);
+ code.mulsd(src, code.Const(xword, scale_factor));
+ }
+
+ if (!truncating) {
+ code.roundsd(src, src, *round_imm);
+ }
+ } else {
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((fbits + 127) << 23);
+ code.mulss(src, code.Const(xword, scale_factor));
+ }
+
+ if (!truncating) {
+ code.roundss(src, src, *round_imm);
+ }
+
+ code.cvtss2sd(src, src);
+ }
+
+ if constexpr (isize == 64) {
+ const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
+
+ if (!unsigned_) {
+ SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel();
+
+ ZeroIfNaN<64>(code, src, scratch);
+
+ code.movsd(scratch, code.Const(xword, f64_max_s64_lim));
+ code.comisd(scratch, src);
+ code.jna(*saturate_max, code.T_NEAR);
+ code.cvttsd2si(result, src); // 64 bit gpr
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code] {
+ code.L(*saturate_max);
+ code.mov(result, 0x7FFF'FFFF'FFFF'FFFF);
+ code.jmp(*end, code.T_NEAR);
+ });
+ } else {
+ Xbyak::Label below_max;
+
+ const Xbyak::Reg64 result2 = ctx.reg_alloc.ScratchGpr().cvt64();
+
+ code.pxor(xmm0, xmm0);
+
+ code.movaps(scratch, src);
+ code.subsd(scratch, code.Const(xword, f64_max_s64_lim));
+
+ // these both result in zero if src/scratch are NaN
+ code.maxsd(src, xmm0);
+ code.maxsd(scratch, xmm0);
+
+ code.cvttsd2si(result, src);
+ code.cvttsd2si(result2, scratch);
+ code.or_(result, result2);
+
+ // when src < 2^63, result2 == 0, and result contains the final result
+ // when src >= 2^63, result contains 0x800.... and result2 contains the non-MSB bits
+                    // the MSB of result2 is 1 when src >= 2^64
+
+ code.sar(result2, 63);
+ code.or_(result, result2);
+ }
+ } else if constexpr (isize == 32) {
+ if (!unsigned_) {
+ const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
+
+ ZeroIfNaN<64>(code, src, scratch);
+ code.minsd(src, code.Const(xword, f64_max_s32));
+ // maxsd not required as cvttsd2si results in 0x8000'0000 when out of range
+ code.cvttsd2si(result.cvt32(), src); // 32 bit gpr
+ } else {
+ code.pxor(xmm0, xmm0);
+ code.maxsd(src, xmm0); // results in a zero if src is NaN
+ code.minsd(src, code.Const(xword, f64_max_u32));
+ code.cvttsd2si(result, src); // 64 bit gpr
+ }
+ } else {
+ const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm();
+
+ ZeroIfNaN<64>(code, src, scratch);
+ code.maxsd(src, code.Const(xword, unsigned_ ? f64_min_u16 : f64_min_s16));
+ code.minsd(src, code.Const(xword, unsigned_ ? f64_max_u16 : f64_max_s16));
+ code.cvttsd2si(result, src); // 64 bit gpr
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+
+ return;
+ }
+ }
+
+ using fbits_list = mp::lift_sequence<std::make_index_sequence<isize + 1>>;
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) {
+ constexpr size_t fbits = mp::get<0, I>::value;
+ constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ return FP::FPToFixed<FPT>(isize, static_cast<FPT>(input), fbits, unsigned_, fpcr, rounding_mode, fpsr);
+ })};
+ },
+ mp::cartesian_product<fbits_list, rounding_list>{});
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+ code.CallFunction(lut.at(std::make_tuple(fbits, rounding_mode)));
+}
+
+void EmitX64::EmitFPDoubleToFixedS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, false, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPDoubleToFixedS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, false, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPDoubleToFixedS64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, false, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPDoubleToFixedU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, true, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPDoubleToFixedU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, true, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPDoubleToFixedU64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<64, true, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, false, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, false, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedS64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, false, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, true, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, true, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPHalfToFixedU64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<16, true, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, false, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, false, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedS64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, false, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, true, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, true, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSingleToFixedU64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPToFixed<32, true, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPFixedS16ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.movsx(tmp, from);
+ code.cvtsi2ss(result, tmp);
+
+ if (fbits != 0) {
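+        // 2^-fbits encoded directly as an f32 bit pattern: exponent = 127 - fbits, mantissa = 0.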
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU16ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.movzx(tmp, from);
+ code.cvtsi2ss(result, tmp);
+
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedS32ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 from = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+
+ if (rounding_mode == ctx.FPCR().RMode() || ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) {
+ code.cvtsi2ss(result, from);
+ } else {
+ ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven);
+ code.EnterStandardASIMD();
+ code.cvtsi2ss(result, from);
+ code.LeaveStandardASIMD();
+ }
+
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+
+ const auto op = [&] {
+ if (code.HasHostFeature(HostFeature::AVX512F)) {
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ code.vcvtusi2ss(result, result, from.cvt32());
+ } else {
+            // Use a 64-bit GPR so the conversion does not treat the 32-bit input as signed
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary
+ code.cvtsi2ss(result, from);
+ }
+ };
+
+ if (rounding_mode == ctx.FPCR().RMode() || ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) {
+ op();
+ } else {
+ ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven);
+ code.EnterStandardASIMD();
+ op();
+ code.LeaveStandardASIMD();
+ }
+
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedS16ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.movsx(tmp, from);
+ code.cvtsi2sd(result, tmp);
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU16ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.movzx(tmp, from);
+ code.cvtsi2sd(result, tmp);
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedS32ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 from = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.cvtsi2sd(result, from);
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm to = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required
+
+ code.xorps(to, to);
+
+ if (code.HasHostFeature(HostFeature::AVX512F)) {
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ code.vcvtusi2sd(to, to, from.cvt32());
+ } else {
+        // Use a 64-bit GPR so the conversion does not treat the 32-bit input as signed
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary
+ code.cvtsi2sd(to, from);
+ }
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(to, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, to);
+}
+
+void EmitX64::EmitFPFixedS64ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ ASSERT(rounding_mode == ctx.FPCR().RMode());
+
+ code.cvtsi2sd(result, from);
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedS64ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ ASSERT(rounding_mode == ctx.FPCR().RMode());
+
+ code.cvtsi2ss(result, from);
+
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ ASSERT(rounding_mode == ctx.FPCR().RMode());
+
+ if (code.HasHostFeature(HostFeature::AVX512F)) {
+ code.vcvtusi2sd(result, result, from);
+ } else {
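+        // u64 -> f64 without AVX-512: build the doubles (2^52 + lo32) and (2^84 + hi32 * 2^32),
+        // subtract the biases, then add the two halves together.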
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movq(tmp, from);
+ code.punpckldq(tmp, code.Const(xword, 0x4530000043300000, 0));
+ code.subpd(tmp, code.Const(xword, 0x4330000000000000, 0x4530000000000000));
+ code.pshufd(result, tmp, 0b01001110);
+ code.addpd(result, tmp);
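+        // Under round-towards-minus-infinity the bias subtraction produces -0.0 for a zero input;
+        // clear the sign bit so the result is +0.0.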
+ if (ctx.FPCR().RMode() == FP::RoundingMode::TowardsMinusInfinity) {
+ code.pand(result, code.Const(xword, f64_non_sign_mask));
+ }
+ }
+
+ if (fbits != 0) {
+ const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52);
+ code.mulsd(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const size_t fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ ASSERT(rounding_mode == ctx.FPCR().RMode());
+
+ if (code.HasHostFeature(HostFeature::AVX512F)) {
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]);
+ code.vcvtusi2ss(result, result, from);
+ } else {
+ const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]);
+ code.pxor(result, result);
+
+ Xbyak::Label negative;
+ Xbyak::Label end;
+
+ code.test(from, from);
+ code.js(negative);
+
+ code.cvtsi2ss(result, from);
+ code.jmp(end);
+
+ code.L(negative);
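+        // MSB is set, so the value does not fit a signed conversion: halve it while folding the
+        // shifted-out bit back in (so rounding still sees it), convert, then double the result.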
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+ code.mov(tmp, from);
+ code.shr(tmp, 1);
+ code.and_(from.cvt32(), 1);
+ code.or_(from, tmp);
+ code.cvtsi2ss(result, from);
+ code.addss(result, result);
+
+ code.L(end);
+ }
+
+ if (fbits != 0) {
+ const u32 scale_factor = static_cast<u32>((127 - fbits) << 23);
+ code.mulss(result, code.Const(xword, scale_factor));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
new file mode 100644
index 0000000000..272b896ae3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
@@ -0,0 +1,545 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/macro/concatenate_tokens.hpp>
+
+#define AxxEmitX64 CONCATENATE_TOKENS(Axx, EmitX64)
+#define AxxEmitContext CONCATENATE_TOKENS(Axx, EmitContext)
+#define AxxJitState CONCATENATE_TOKENS(Axx, JitState)
+#define AxxUserConfig Axx::UserConfig
+
+namespace {
+using Vector = std::array<u64, 2>;
+}
+
+std::optional<AxxEmitX64::DoNotFastmemMarker> AxxEmitX64::ShouldFastmem(AxxEmitContext& ctx, IR::Inst* inst) const {
+ if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) {
+ return std::nullopt;
+ }
+
+ const auto marker = std::make_tuple(ctx.Location(), inst->GetName());
+ if (do_not_fastmem.count(marker) > 0) {
+ return std::nullopt;
+ }
+ return marker;
+}
+
+FakeCall AxxEmitX64::FastmemCallback(u64 rip_) {
+ const auto iter = fastmem_patch_info.find(rip_);
+
+ if (iter == fastmem_patch_info.end()) {
+ fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_);
+ fmt::print("Segfault wasn't at a fastmem patch location!\n");
+ fmt::print("Now dumping code.......\n\n");
+ Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000);
+ ASSERT_FALSE("iter != fastmem_patch_info.end()");
+ }
+
+ FakeCall result{
+ .call_rip = iter->second.callback,
+ .ret_rip = iter->second.resume_rip,
+ };
+
+ if (iter->second.recompile) {
+ const auto marker = iter->second.marker;
+ do_not_fastmem.insert(marker);
+ InvalidateBasicBlocks({std::get<0>(marker)});
+ }
+
+ return result;
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+ const auto fastmem_marker = ShouldFastmem(ctx, inst);
+
+ if (!conf.page_table && !fastmem_marker) {
+ // Neither fastmem nor page table: Use callbacks
+ if constexpr (bitsize == 128) {
+ ctx.reg_alloc.HostCall(nullptr, {}, args[1]);
+ if (ordered) {
+ code.mfence();
+ }
+ code.CallFunction(memory_read_128);
+ ctx.reg_alloc.DefineValue(inst, xmm1);
+ } else {
+ ctx.reg_alloc.HostCall(inst, {}, args[1]);
+ if (ordered) {
+ code.mfence();
+ }
+ Devirtualize<callback>(conf.callbacks).EmitCall(code);
+ code.ZeroExtendFrom(bitsize, code.ABI_RETURN);
+ }
+ EmitCheckMemoryAbort(ctx, inst);
+ return;
+ }
+
+ if (ordered && bitsize == 128) {
+        // Required for atomic 128-bit loads/stores: cmpxchg16b clobbers rax, rbx, rcx and rdx
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ }
+
+ const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
+ const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx();
+
+ const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
+
+ SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+
+ if (fastmem_marker) {
+ // Use fastmem
+ bool require_abort_handling;
+ const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
+
+ const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered);
+
+ ctx.deferred_emits.emplace_back([=, this, &ctx] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+
+ fastmem_patch_info.emplace(
+ mcl::bit_cast<u64>(location),
+ FastmemPatchInfo{
+ mcl::bit_cast<u64>(code.getCurr()),
+ mcl::bit_cast<u64>(wrapped_fn),
+ *fastmem_marker,
+ conf.recompile_on_fastmem_failure,
+ });
+
+ EmitCheckMemoryAbort(ctx, inst, end.get());
+ code.jmp(*end, code.T_NEAR);
+ });
+ } else {
+ // Use page table
+ ASSERT(conf.page_table);
+ const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, *abort, vaddr);
+ EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered);
+
+ ctx.deferred_emits.emplace_back([=, this, &ctx] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+ EmitCheckMemoryAbort(ctx, inst, end.get());
+ code.jmp(*end, code.T_NEAR);
+ });
+ }
+ code.L(*end);
+
+ if constexpr (bitsize == 128) {
+ ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx});
+ } else {
+ ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx});
+ }
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+ const auto fastmem_marker = ShouldFastmem(ctx, inst);
+
+ if (!conf.page_table && !fastmem_marker) {
+ // Neither fastmem nor page table: Use callbacks
+ if constexpr (bitsize == 128) {
+ ctx.reg_alloc.Use(args[1], ABI_PARAM2);
+ ctx.reg_alloc.Use(args[2], HostLoc::XMM1);
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+ code.CallFunction(memory_write_128);
+ } else {
+ ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]);
+ Devirtualize<callback>(conf.callbacks).EmitCall(code);
+ }
+ if (ordered) {
+ code.mfence();
+ }
+ EmitCheckMemoryAbort(ctx, inst);
+ return;
+ }
+
+ if (ordered && bitsize == 128) {
+        // Required for atomic 128-bit loads/stores: cmpxchg16b clobbers rax, rbx, rcx and rdx
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ }
+
+ const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
+ const int value_idx = bitsize == 128
+ ? ctx.reg_alloc.UseXmm(args[2]).getIdx()
+ : (ordered ? ctx.reg_alloc.UseScratchGpr(args[2]).getIdx() : ctx.reg_alloc.UseGpr(args[2]).getIdx());
+
+ const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
+
+ SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+
+ if (fastmem_marker) {
+ // Use fastmem
+ bool require_abort_handling;
+ const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
+
+ const auto location = EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered);
+
+ ctx.deferred_emits.emplace_back([=, this, &ctx] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+
+ fastmem_patch_info.emplace(
+ mcl::bit_cast<u64>(location),
+ FastmemPatchInfo{
+ mcl::bit_cast<u64>(code.getCurr()),
+ mcl::bit_cast<u64>(wrapped_fn),
+ *fastmem_marker,
+ conf.recompile_on_fastmem_failure,
+ });
+
+ EmitCheckMemoryAbort(ctx, inst, end.get());
+ code.jmp(*end, code.T_NEAR);
+ });
+ } else {
+ // Use page table
+ ASSERT(conf.page_table);
+ const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, *abort, vaddr);
+ EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered);
+
+ ctx.deferred_emits.emplace_back([=, this, &ctx] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+ EmitCheckMemoryAbort(ctx, inst, end.get());
+ code.jmp(*end, code.T_NEAR);
+ });
+ }
+ code.L(*end);
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) {
+ ASSERT(conf.global_monitor != nullptr);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+
+ if constexpr (bitsize != 128) {
+ using T = mcl::unsigned_integer_of_size<bitsize>;
+
+ ctx.reg_alloc.HostCall(inst, {}, args[1]);
+
+ code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
+ code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
+ if (ordered) {
+ code.mfence();
+ }
+ code.CallLambda(
+ [](AxxUserConfig& conf, Axx::VAddr vaddr) -> T {
+ return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T {
+ return (conf.callbacks->*callback)(vaddr);
+ });
+ });
+ code.ZeroExtendFrom(bitsize, code.ABI_RETURN);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.Use(args[1], ABI_PARAM2);
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
+ code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
+ ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
+ if (ordered) {
+ code.mfence();
+ }
+ code.CallLambda(
+ [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& ret) {
+ ret = conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector {
+ return (conf.callbacks->*callback)(vaddr);
+ });
+ });
+ code.movups(result, xword[rsp + ABI_SHADOW_SPACE]);
+ ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+
+ EmitCheckMemoryAbort(ctx, inst);
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
+ ASSERT(conf.global_monitor != nullptr);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool ordered = IsOrdered(args[3].GetImmediateAccType());
+
+ if constexpr (bitsize != 128) {
+ ctx.reg_alloc.HostCall(inst, {}, args[1], args[2]);
+ } else {
+ ctx.reg_alloc.Use(args[1], ABI_PARAM2);
+ ctx.reg_alloc.Use(args[2], HostLoc::XMM1);
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(inst);
+ }
+
+ Xbyak::Label end;
+
+ code.mov(code.ABI_RETURN, u32(1));
+ code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+ code.je(end);
+ code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+ code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
+ if constexpr (bitsize != 128) {
+ using T = mcl::unsigned_integer_of_size<bitsize>;
+
+ code.CallLambda(
+ [](AxxUserConfig& conf, Axx::VAddr vaddr, T value) -> u32 {
+ return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr,
+ [&](T expected) -> bool {
+ return (conf.callbacks->*callback)(vaddr, value, expected);
+ })
+ ? 0
+ : 1;
+ });
+ if (ordered) {
+ code.mfence();
+ }
+ } else {
+ ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.movaps(xword[code.ABI_PARAM3], xmm1);
+ code.CallLambda(
+ [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& value) -> u32 {
+ return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr,
+ [&](Vector expected) -> bool {
+ return (conf.callbacks->*callback)(vaddr, value, expected);
+ })
+ ? 0
+ : 1;
+ });
+ if (ordered) {
+ code.mfence();
+ }
+ ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
+ }
+ code.L(end);
+
+ EmitCheckMemoryAbort(ctx, inst);
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) {
+ ASSERT(conf.global_monitor && conf.fastmem_pointer);
+ if (!exception_handler.SupportsFastmem()) {
+ EmitExclusiveReadMemory<bitsize, callback>(ctx, inst);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ constexpr bool ordered = true;
+
+ if constexpr (ordered && bitsize == 128) {
+        // Required for atomic 128-bit loads/stores: cmpxchg16b clobbers rax, rbx, rcx and rdx
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ }
+
+ const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
+ const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx();
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr();
+
+ const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
+
+ EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
+
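+    // Take the reservation: flag exclusive_state and record the reserved vaddr in this
+    // processor's monitor slot.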
+ code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
+ code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
+ code.mov(qword[tmp], vaddr);
+
+ const auto fastmem_marker = ShouldFastmem(ctx, inst);
+ if (fastmem_marker) {
+ SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+ bool require_abort_handling = false;
+
+ const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
+
+ const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered);
+
+ fastmem_patch_info.emplace(
+ mcl::bit_cast<u64>(location),
+ FastmemPatchInfo{
+ mcl::bit_cast<u64>(code.getCurr()),
+ mcl::bit_cast<u64>(wrapped_fn),
+ *fastmem_marker,
+ conf.recompile_on_exclusive_fastmem_failure,
+ });
+
+ code.L(*end);
+
+ if (require_abort_handling) {
+ ctx.deferred_emits.emplace_back([=, this] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+ code.jmp(*end, code.T_NEAR);
+ });
+ }
+ } else {
+ code.call(wrapped_fn);
+ }
+
+ code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id)));
+ EmitWriteMemoryMov<bitsize>(code, tmp, value_idx, false);
+
+ EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32());
+
+ if constexpr (bitsize == 128) {
+ ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx});
+ } else {
+ ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx});
+ }
+
+ EmitCheckMemoryAbort(ctx, inst);
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) {
+ ASSERT(conf.global_monitor && conf.fastmem_pointer);
+ if (!exception_handler.SupportsFastmem()) {
+ EmitExclusiveWriteMemory<bitsize, callback>(ctx, inst);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ constexpr bool ordered = true;
+
+ const auto value = [&] {
+ if constexpr (bitsize == 128) {
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+ ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+ return ctx.reg_alloc.UseXmm(args[2]);
+ } else {
+ ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+ return ctx.reg_alloc.UseGpr(args[2]);
+ }
+ }();
+ const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]);
+ const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+ const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())];
+
+ EmitExclusiveLock(code, conf, tmp, eax);
+
+ SharedLabel end = GenSharedLabel();
+
+ code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
+ code.mov(status, u32(1));
+ code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+ code.je(*end, code.T_NEAR);
+ code.cmp(qword[tmp], vaddr);
+ code.jne(*end, code.T_NEAR);
+
+ EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax);
+
+ code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+ code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id)));
+
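+    // The locked cmpxchg below compares against rax (rdx:rax for cmpxchg16b), so load the
+    // expected value from the exclusive monitor; cmpxchg16b takes its new value in rcx:rbx.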
+ if constexpr (bitsize == 128) {
+ code.mov(rax, qword[tmp + 0]);
+ code.mov(rdx, qword[tmp + 8]);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(rbx, value);
+ code.pextrq(rcx, value, 1);
+ } else {
+ code.movaps(xmm0, value);
+ code.movq(rbx, xmm0);
+ code.punpckhqdq(xmm0, xmm0);
+ code.movq(rcx, xmm0);
+ }
+ } else {
+ EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp, false);
+ }
+
+ const auto fastmem_marker = ShouldFastmem(ctx, inst);
+ if (fastmem_marker) {
+ SharedLabel abort = GenSharedLabel();
+ bool require_abort_handling = false;
+
+ const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp);
+
+ const auto location = code.getCurr();
+
+ if constexpr (bitsize == 128) {
+ code.lock();
+ code.cmpxchg16b(ptr[dest_ptr]);
+ } else {
+ switch (bitsize) {
+ case 8:
+ code.lock();
+ code.cmpxchg(code.byte[dest_ptr], value.cvt8());
+ break;
+ case 16:
+ code.lock();
+ code.cmpxchg(word[dest_ptr], value.cvt16());
+ break;
+ case 32:
+ code.lock();
+ code.cmpxchg(dword[dest_ptr], value.cvt32());
+ break;
+ case 64:
+ code.lock();
+ code.cmpxchg(qword[dest_ptr], value.cvt64());
+ break;
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ code.setnz(status.cvt8());
+
+ ctx.deferred_emits.emplace_back([=, this] {
+ code.L(*abort);
+ code.call(wrapped_fn);
+
+ fastmem_patch_info.emplace(
+ mcl::bit_cast<u64>(location),
+ FastmemPatchInfo{
+ mcl::bit_cast<u64>(code.getCurr()),
+ mcl::bit_cast<u64>(wrapped_fn),
+ *fastmem_marker,
+ conf.recompile_on_exclusive_fastmem_failure,
+ });
+
+ code.cmp(al, 0);
+ code.setz(status.cvt8());
+ code.movzx(status.cvt32(), status.cvt8());
+ code.jmp(*end, code.T_NEAR);
+ });
+ } else {
+ code.call(wrapped_fn);
+ code.cmp(al, 0);
+ code.setz(status.cvt8());
+ code.movzx(status.cvt32(), status.cvt8());
+ }
+
+ code.L(*end);
+
+ EmitExclusiveUnlock(code, conf, tmp, eax);
+
+ ctx.reg_alloc.DefineValue(inst, status);
+
+ EmitCheckMemoryAbort(ctx, inst);
+}
+
+#undef AxxEmitX64
+#undef AxxEmitContext
+#undef AxxJitState
+#undef AxxUserConfig
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
new file mode 100644
index 0000000000..c99980d617
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
@@ -0,0 +1,388 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit_cast.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/a32_emit_x64.h"
+#include "dynarmic/backend/x64/a64_emit_x64.h"
+#include "dynarmic/backend/x64/exclusive_monitor_friend.h"
+#include "dynarmic/common/spin_lock_x64.h"
+#include "dynarmic/interface/exclusive_monitor.h"
+#include "dynarmic/ir/acc_type.h"
+
+namespace Dynarmic::Backend::X64 {
+
+namespace {
+
+using namespace Xbyak::util;
+
+constexpr size_t page_bits = 12;
+constexpr size_t page_size = 1 << page_bits;
+constexpr size_t page_mask = (1 << page_bits) - 1;
+
+template<typename EmitContext>
+void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) {
+ if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) {
+ return;
+ }
+
+ const u32 align_mask = [bitsize]() -> u32 {
+ switch (bitsize) {
+ case 16:
+ return 0b1;
+ case 32:
+ return 0b11;
+ case 64:
+ return 0b111;
+ case 128:
+ return 0b1111;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ code.test(vaddr, align_mask);
+
+ if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
+ code.jnz(abort, code.T_NEAR);
+ return;
+ }
+
+ const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask;
+
+ SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
+
+ code.jnz(*detect_boundary, code.T_NEAR);
+ code.L(*resume);
+
+ ctx.deferred_emits.emplace_back([=, &code] {
+ code.L(*detect_boundary);
+ code.mov(tmp, vaddr);
+ code.and_(tmp, page_align_mask);
+ code.cmp(tmp, page_align_mask);
+ code.jne(*resume, code.T_NEAR);
+ // NOTE: We expect to fallthrough into abort code here.
+ });
+}
+
+template<typename EmitContext>
+Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr);
+
+template<>
+[[maybe_unused]] Xbyak::RegExp EmitVAddrLookup<A32EmitContext>(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) {
+ const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32();
+
+ EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp.cvt64());
+
+ // TODO: This code assumes vaddr has been zext from 32-bits to 64-bits.
+
+ code.mov(tmp, vaddr.cvt32());
+ code.shr(tmp, static_cast<int>(page_bits));
+
+ code.mov(page, qword[r14 + tmp.cvt64() * sizeof(void*)]);
+ if (ctx.conf.page_table_pointer_mask_bits == 0) {
+ code.test(page, page);
+ } else {
+ code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits);
+ }
+ code.jz(abort, code.T_NEAR);
+ if (ctx.conf.absolute_offset_page_table) {
+ return page + vaddr;
+ }
+ code.mov(tmp, vaddr.cvt32());
+ code.and_(tmp, static_cast<u32>(page_mask));
+ return page + tmp.cvt64();
+}
+
+template<>
+[[maybe_unused]] Xbyak::RegExp EmitVAddrLookup<A64EmitContext>(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) {
+ const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits;
+ const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits;
+
+ const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? page : ctx.reg_alloc.ScratchGpr();
+
+ EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp);
+
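+    // Derive the page-table index from vaddr: unused top address bits are either mirrored away
+    // or cause an abort, depending on conf.silently_mirror_page_table.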
+ if (unused_top_bits == 0) {
+ code.mov(tmp, vaddr);
+ code.shr(tmp, int(page_bits));
+ } else if (ctx.conf.silently_mirror_page_table) {
+ if (valid_page_index_bits >= 32) {
+ if (code.HasHostFeature(HostFeature::BMI2)) {
+ const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr();
+ code.mov(bit_count, unused_top_bits);
+ code.bzhi(tmp, vaddr, bit_count);
+ code.shr(tmp, int(page_bits));
+ ctx.reg_alloc.Release(bit_count);
+ } else {
+ code.mov(tmp, vaddr);
+ code.shl(tmp, int(unused_top_bits));
+ code.shr(tmp, int(unused_top_bits + page_bits));
+ }
+ } else {
+ code.mov(tmp, vaddr);
+ code.shr(tmp, int(page_bits));
+ code.and_(tmp, u32((1 << valid_page_index_bits) - 1));
+ }
+ } else {
+ ASSERT(valid_page_index_bits < 32);
+ code.mov(tmp, vaddr);
+ code.shr(tmp, int(page_bits));
+ code.test(tmp, u32(-(1 << valid_page_index_bits)));
+ code.jnz(abort, code.T_NEAR);
+ }
+ code.mov(page, qword[r14 + tmp * sizeof(void*)]);
+ if (ctx.conf.page_table_pointer_mask_bits == 0) {
+ code.test(page, page);
+ } else {
+ code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits);
+ }
+ code.jz(abort, code.T_NEAR);
+ if (ctx.conf.absolute_offset_page_table) {
+ return page + vaddr;
+ }
+ code.mov(tmp, vaddr);
+ code.and_(tmp, static_cast<u32>(page_mask));
+ return page + tmp;
+}
+
+template<typename EmitContext>
+Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional<Xbyak::Reg64> tmp = std::nullopt);
+
+template<>
+[[maybe_unused]] Xbyak::RegExp EmitFastmemVAddr<A32EmitContext>(BlockOfCode&, A32EmitContext&, Xbyak::Label&, Xbyak::Reg64 vaddr, bool&, std::optional<Xbyak::Reg64>) {
+ return r13 + vaddr;
+}
+
+template<>
+[[maybe_unused]] Xbyak::RegExp EmitFastmemVAddr<A64EmitContext>(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional<Xbyak::Reg64> tmp) {
+ const size_t unused_top_bits = 64 - ctx.conf.fastmem_address_space_bits;
+
+ if (unused_top_bits == 0) {
+ return r13 + vaddr;
+ } else if (ctx.conf.silently_mirror_fastmem) {
+ if (!tmp) {
+ tmp = ctx.reg_alloc.ScratchGpr();
+ }
+ if (unused_top_bits < 32) {
+ code.mov(*tmp, vaddr);
+ code.shl(*tmp, int(unused_top_bits));
+ code.shr(*tmp, int(unused_top_bits));
+ } else if (unused_top_bits == 32) {
+ code.mov(tmp->cvt32(), vaddr.cvt32());
+ } else {
+ code.mov(tmp->cvt32(), vaddr.cvt32());
+ code.and_(*tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1));
+ }
+ return r13 + *tmp;
+ } else {
+ if (ctx.conf.fastmem_address_space_bits < 32) {
+ code.test(vaddr, u32(-(1 << ctx.conf.fastmem_address_space_bits)));
+ code.jnz(abort, code.T_NEAR);
+ require_abort_handling = true;
+ } else {
+ // TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator
+ if (!tmp) {
+ tmp = ctx.reg_alloc.ScratchGpr();
+ }
+ code.mov(*tmp, vaddr);
+ code.shr(*tmp, int(ctx.conf.fastmem_address_space_bits));
+ code.jnz(abort, code.T_NEAR);
+ require_abort_handling = true;
+ }
+ return r13 + vaddr;
+ }
+}
+
+template<std::size_t bitsize>
+const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) {
+ if (ordered) {
+ if constexpr (bitsize != 128) {
+ code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx});
+ } else {
+ code.xor_(eax, eax);
+ code.xor_(ebx, ebx);
+ code.xor_(ecx, ecx);
+ code.xor_(edx, edx);
+ }
+
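+        // With the destination registers zeroed, lock xadd adds zero and cmpxchg16b either
+        // matches zero or simply loads, so each acts as an atomic load with the required ordering.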
+ const void* fastmem_location = code.getCurr();
+ switch (bitsize) {
+ case 8:
+ code.lock();
+ code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8());
+ break;
+ case 16:
+ code.lock();
+ code.xadd(word[addr], Xbyak::Reg16{value_idx});
+ break;
+ case 32:
+ code.lock();
+ code.xadd(dword[addr], Xbyak::Reg32{value_idx});
+ break;
+ case 64:
+ code.lock();
+ code.xadd(qword[addr], Xbyak::Reg64{value_idx});
+ break;
+ case 128:
+ code.lock();
+ code.cmpxchg16b(xword[addr]);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(Xbyak::Xmm{value_idx}, rax);
+ code.pinsrq(Xbyak::Xmm{value_idx}, rdx, 1);
+ } else {
+ code.movq(Xbyak::Xmm{value_idx}, rax);
+ code.movq(xmm0, rdx);
+ code.punpcklqdq(Xbyak::Xmm{value_idx}, xmm0);
+ }
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ return fastmem_location;
+ }
+
+ const void* fastmem_location = code.getCurr();
+ switch (bitsize) {
+ case 8:
+ code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]);
+ break;
+ case 16:
+ code.movzx(Xbyak::Reg32{value_idx}, word[addr]);
+ break;
+ case 32:
+ code.mov(Xbyak::Reg32{value_idx}, dword[addr]);
+ break;
+ case 64:
+ code.mov(Xbyak::Reg64{value_idx}, qword[addr]);
+ break;
+ case 128:
+ code.movups(Xbyak::Xmm{value_idx}, xword[addr]);
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ return fastmem_location;
+}
+
+template<std::size_t bitsize>
+const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx, bool ordered) {
+ if (ordered) {
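+        // Ordered stores rely on xchg's implicit lock prefix to act as a full barrier; the
+        // 128-bit case loops on lock cmpxchg16b (new value in rcx:rbx) until the exchange succeeds.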
+ if constexpr (bitsize == 128) {
+ code.xor_(eax, eax);
+ code.xor_(edx, edx);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.movq(rbx, Xbyak::Xmm{value_idx});
+ code.pextrq(rcx, Xbyak::Xmm{value_idx}, 1);
+ } else {
+ code.movaps(xmm0, Xbyak::Xmm{value_idx});
+ code.movq(rbx, xmm0);
+ code.punpckhqdq(xmm0, xmm0);
+ code.movq(rcx, xmm0);
+ }
+ }
+
+ const void* fastmem_location = code.getCurr();
+ switch (bitsize) {
+ case 8:
+ code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8());
+ break;
+ case 16:
+ code.xchg(word[addr], Xbyak::Reg16{value_idx});
+ break;
+ case 32:
+ code.xchg(dword[addr], Xbyak::Reg32{value_idx});
+ break;
+ case 64:
+ code.xchg(qword[addr], Xbyak::Reg64{value_idx});
+ break;
+ case 128: {
+ Xbyak::Label loop;
+ code.L(loop);
+ code.lock();
+ code.cmpxchg16b(xword[addr]);
+ code.jnz(loop);
+ break;
+ }
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ return fastmem_location;
+ }
+
+ const void* fastmem_location = code.getCurr();
+ switch (bitsize) {
+ case 8:
+ code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8());
+ break;
+ case 16:
+ code.mov(word[addr], Xbyak::Reg16{value_idx});
+ break;
+ case 32:
+ code.mov(dword[addr], Xbyak::Reg32{value_idx});
+ break;
+ case 64:
+ code.mov(qword[addr], Xbyak::Reg64{value_idx});
+ break;
+ case 128:
+ code.movups(xword[addr], Xbyak::Xmm{value_idx});
+ break;
+ default:
+ ASSERT_FALSE("Invalid bitsize");
+ }
+ return fastmem_location;
+}
+
+template<typename UserConfig>
+void EmitExclusiveLock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) {
+ if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) {
+ return;
+ }
+
+ code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorLockPointer(conf.global_monitor)));
+ EmitSpinLockLock(code, pointer, tmp);
+}
+
+template<typename UserConfig>
+void EmitExclusiveUnlock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) {
+ if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) {
+ return;
+ }
+
+ code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorLockPointer(conf.global_monitor)));
+ EmitSpinLockUnlock(code, pointer, tmp);
+}
+
+template<typename UserConfig>
+void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 vaddr, Xbyak::Reg64 pointer, Xbyak::Reg64 tmp) {
+ if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) {
+ return;
+ }
+
+ code.mov(tmp, 0xDEAD'DEAD'DEAD'DEAD);
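+    // Any other processor whose exclusive monitor is currently tracking this vaddr has its
+    // monitored address overwritten with a sentinel value, clearing its reservation.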
+ const size_t processor_count = GetExclusiveMonitorProcessorCount(conf.global_monitor);
+ for (size_t processor_index = 0; processor_index < processor_count; processor_index++) {
+ if (processor_index == conf.processor_id) {
+ continue;
+ }
+ Xbyak::Label ok;
+ code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, processor_index)));
+ code.cmp(qword[pointer], vaddr);
+ code.jne(ok);
+ code.mov(qword[pointer], tmp);
+ code.L(ok);
+ }
+}
+
+inline bool IsOrdered(IR::AccType acctype) {
+ return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED;
+}
+
+} // namespace
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp
new file mode 100644
index 0000000000..90f06a9015
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp
@@ -0,0 +1,693 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.paddb(xmm_a, xmm_b);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
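+        // GE per byte is the carry out of the unsigned add: with xmm_a already holding a+b,
+        // min(a+b, b) == b exactly when no wrap occurred, so the inverted compare is the carry mask.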
+
+ code.pcmpeqb(ones, ones);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.pminub(xmm_ge, xmm_b);
+ code.pcmpeqb(xmm_ge, xmm_b);
+ code.pxor(xmm_ge, ones);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqb(xmm0, xmm0);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.paddsb(xmm_ge, xmm_b);
+ code.pcmpgtb(xmm_ge, xmm0);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ code.paddb(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.paddw(xmm_a, xmm_b);
+
+ if (ge_inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqb(ones, ones);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.pminuw(xmm_ge, xmm_b);
+ code.pcmpeqw(xmm_ge, xmm_b);
+ code.pxor(xmm_ge, ones);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ } else {
+ const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+
+            // Carry out of a+b occurs exactly when the wrapped sum is less than b:
+            // !(b <= a+b) == b > a+b
+ code.movdqa(tmp_a, xmm_a);
+ code.movdqa(tmp_b, xmm_b);
+ code.paddw(tmp_a, code.Const(xword, 0x80008000));
+ code.paddw(tmp_b, code.Const(xword, 0x80008000));
+ code.pcmpgtw(tmp_b, tmp_a); // *Signed* comparison!
+
+ ctx.reg_alloc.DefineValue(ge_inst, tmp_b);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqw(xmm0, xmm0);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.paddsw(xmm_ge, xmm_b);
+ code.pcmpgtw(xmm_ge, xmm0);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ code.paddw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.pmaxub(xmm_ge, xmm_b);
+ code.pcmpeqb(xmm_ge, xmm_a);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ code.psubb(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqb(xmm0, xmm0);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.psubsb(xmm_ge, xmm_b);
+ code.pcmpgtb(xmm_ge, xmm0);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ code.psubb(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ if (!ge_inst) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.psubw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.pmaxuw(xmm_ge, xmm_b); // Requires SSE 4.1
+ code.pcmpeqw(xmm_ge, xmm_a);
+
+ code.psubw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ return;
+ }
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
+
+ // (a >= b) == !(b > a)
+ code.pcmpeqb(ones, ones);
+ code.paddw(xmm_a, code.Const(xword, 0x80008000));
+ code.paddw(xmm_b, code.Const(xword, 0x80008000));
+ code.movdqa(xmm_ge, xmm_b);
+ code.pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison!
+ code.pxor(xmm_ge, ones);
+
+ code.psubw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (ge_inst) {
+ const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqw(xmm0, xmm0);
+
+ code.movdqa(xmm_ge, xmm_a);
+ code.psubsw(xmm_ge, xmm_b);
+ code.pcmpgtw(xmm_ge, xmm0);
+
+ ctx.reg_alloc.DefineValue(ge_inst, xmm_ge);
+ }
+
+ code.psubw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsInXmm() || args[1].IsInXmm()) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm();
+
+ // Since,
+ // pavg(a, b) == (a + b + 1) >> 1
+ // Therefore,
+ // ~pavg(~a, ~b) == (a + b) >> 1
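+        // (Per byte, ~x == 255 - x, so pavg(~a, ~b) == ((255 - a) + (255 - b) + 1) >> 1
+        //  == (511 - (a + b)) >> 1; complementing once more gives exactly (a + b) >> 1,
+        //  i.e. the truncating average.)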
+
+ code.pcmpeqb(ones, ones);
+ code.pxor(xmm_a, ones);
+ code.pxor(xmm_b, ones);
+ code.pavgb(xmm_a, xmm_b);
+ code.pxor(xmm_a, ones);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ } else {
+ const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 and_a_b = reg_a;
+ const Xbyak::Reg32 result = reg_a;
+
+ // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+ // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
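+        // For reference, the per-field identity as a plain scalar (illustrative only, not emitted):
+        //     u32 halving_add(u32 x, u32 y) { return (x & y) + ((x ^ y) >> 1); }  // == floor((x + y) / 2) without overflow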
+
+ code.mov(xor_a_b, reg_a);
+ code.and_(and_a_b, reg_b);
+ code.xor_(xor_a_b, reg_b);
+ code.shr(xor_a_b, 1);
+ code.and_(xor_a_b, 0x7F7F7F7F);
+ code.add(result, xor_a_b);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (args[0].IsInXmm() || args[1].IsInXmm()) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp, xmm_a);
+ code.pand(xmm_a, xmm_b);
+ code.pxor(tmp, xmm_b);
+ code.psrlw(tmp, 1);
+ code.paddw(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ } else {
+ const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 and_a_b = reg_a;
+ const Xbyak::Reg32 result = reg_a;
+
+ // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+ // We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below.
+
+ code.mov(xor_a_b, reg_a);
+ code.and_(and_a_b, reg_b);
+ code.xor_(xor_a_b, reg_b);
+ code.shr(xor_a_b, 1);
+ code.and_(xor_a_b, 0x7FFF7FFF);
+ code.add(result, xor_a_b);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 and_a_b = reg_a;
+ const Xbyak::Reg32 result = reg_a;
+ const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1).
+ // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below.
+    // carry holds the top bit of each byte of x^y, i.e. the sign bit the logical shift discarded;
+    // XORing it back in is equivalent to adding 0x80 per byte without carrying into the
+    // neighbouring byte, which turns the logical shift into the arithmetic shift required here.
+
+ code.mov(xor_a_b, reg_a);
+ code.and_(and_a_b, reg_b);
+ code.xor_(xor_a_b, reg_b);
+ code.mov(carry, xor_a_b);
+ code.and_(carry, 0x80808080);
+ code.shr(xor_a_b, 1);
+ code.and_(xor_a_b, 0x7F7F7F7F);
+ code.add(result, xor_a_b);
+ code.xor_(result, carry);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // This relies on the equality x+y == ((x&y) << 1) + (x^y).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>>1).
+ // The arithmetic shift right makes this signed.
+
+ code.movdqa(tmp, xmm_a);
+ code.pand(xmm_a, xmm_b);
+ code.pxor(tmp, xmm_b);
+ code.psraw(tmp, 1);
+ code.paddw(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+    // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+
+ code.xor_(minuend, subtrahend);
+ code.and_(subtrahend, minuend);
+ code.shr(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >> 1
+ // subtrahend := (a^b) & b
+
+ // We must now perform a partitioned subtraction.
+    // We can do this because minuend contains 7-bit fields.
+ // We use the extra bit in minuend as a bit to borrow from; we set this bit.
+ // We invert this bit at the end as this tells us if that bit was borrowed from.
+ code.or_(minuend, 0x80808080);
+ code.sub(minuend, subtrahend);
+ code.xor_(minuend, 0x80808080);
+
+ // minuend now contains the desired result.
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+
+ code.xor_(minuend, subtrahend);
+ code.and_(subtrahend, minuend);
+ code.mov(carry, minuend);
+ code.and_(carry, 0x80808080);
+ code.shr(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >> 1
+ // subtrahend := (a^b) & b
+ // carry := (a^b) & 0x80808080
+
+ // We must now perform a partitioned subtraction.
+    // We can do this because minuend contains 7-bit fields.
+ // We use the extra bit in minuend as a bit to borrow from; we set this bit.
+ // We invert this bit at the end as this tells us if that bit was borrowed from.
+ // We then sign extend the result into this bit.
+ code.or_(minuend, 0x80808080);
+ code.sub(minuend, subtrahend);
+ code.xor_(minuend, 0x80808080);
+ code.xor_(minuend, carry);
+
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+
+ code.pxor(minuend, subtrahend);
+ code.pand(subtrahend, minuend);
+ code.psrlw(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >> 1
+ // subtrahend := (a^b) & b
+
+ code.psubw(minuend, subtrahend);
+
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>>1) - ((x^y)&y).
+
+ code.pxor(minuend, subtrahend);
+ code.pand(subtrahend, minuend);
+ code.psraw(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >>> 1
+ // subtrahend := (a^b) & b
+
+ code.psubw(minuend, subtrahend);
+
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+static void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);
+
+ const Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+ const Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr().cvt32();
+ Xbyak::Reg32 reg_sum, reg_diff;
+
+ if (is_signed) {
+ code.movsx(reg_a_lo, reg_a_hi.cvt16());
+ code.movsx(reg_b_lo, reg_b_hi.cvt16());
+ code.sar(reg_a_hi, 16);
+ code.sar(reg_b_hi, 16);
+ } else {
+ code.movzx(reg_a_lo, reg_a_hi.cvt16());
+ code.movzx(reg_b_lo, reg_b_hi.cvt16());
+ code.shr(reg_a_hi, 16);
+ code.shr(reg_b_hi, 16);
+ }
+
+ if (hi_is_sum) {
+ code.sub(reg_a_lo, reg_b_hi);
+ code.add(reg_a_hi, reg_b_lo);
+ reg_diff = reg_a_lo;
+ reg_sum = reg_a_hi;
+ } else {
+ code.add(reg_a_lo, reg_b_hi);
+ code.sub(reg_a_hi, reg_b_lo);
+ reg_diff = reg_a_hi;
+ reg_sum = reg_a_lo;
+ }
+
+ if (ge_inst) {
+ // The reg_b registers are no longer required.
+ const Xbyak::Reg32 ge_sum = reg_b_hi;
+ const Xbyak::Reg32 ge_diff = reg_b_lo;
+
+ code.mov(ge_sum, reg_sum);
+ code.mov(ge_diff, reg_diff);
+
+ if (!is_signed) {
+ code.shl(ge_sum, 15);
+ code.sar(ge_sum, 31);
+ } else {
+ code.not_(ge_sum);
+ code.sar(ge_sum, 31);
+ }
+ code.not_(ge_diff);
+ code.sar(ge_diff, 31);
+ code.and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
+ code.and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
+ code.or_(ge_sum, ge_diff);
+
+ ctx.reg_alloc.DefineValue(ge_inst, ge_sum);
+ }
+
+ if (is_halving) {
+ code.shl(reg_a_lo, 15);
+ code.shr(reg_a_hi, 1);
+ } else {
+ code.shl(reg_a_lo, 16);
+ }
+
+ // reg_a_lo now contains the low word and reg_a_hi now contains the high word.
+ // Merge them.
+ code.shld(reg_a_hi, reg_a_lo, 16);
+
+ ctx.reg_alloc.DefineValue(inst, reg_a_hi);
+}
+
+void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, true, false, false);
+}
+
+void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, true, true, false);
+}
+
+void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, false, false, false);
+}
+
+void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, false, true, false);
+}
+
+void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, true, false, true);
+}
+
+void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, true, true, true);
+}
+
+void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, false, false, true);
+}
+
+void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedSubAdd(code, ctx, inst, false, true, true);
+}
+
+static void EmitPackedOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ (code.*fn)(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusb);
+}
+
+void EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsb);
+}
+
+void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusb);
+}
+
+void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsb);
+}
+
+void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusw);
+}
+
+void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsw);
+}
+
+void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusw);
+}
+
+void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsw);
+}
+
+void EmitX64::EmitPackedAbsDiffSumU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // TODO: Optimize with zero-extension detection
+ code.movaps(tmp, code.Const(xword, 0x0000'0000'ffff'ffff));
+ code.pand(xmm_a, tmp);
+ code.pand(xmm_b, tmp);
+ code.psadbw(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const size_t num_args_in_xmm = args[0].IsInXmm() + args[1].IsInXmm() + args[2].IsInXmm();
+
+ if (num_args_in_xmm >= 2) {
+ const Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm to = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(args[2]);
+
+ code.pand(from, ge);
+ code.pandn(ge, to);
+ code.por(from, ge);
+
+ ctx.reg_alloc.DefineValue(inst, from);
+ } else if (code.HasHostFeature(HostFeature::BMI1)) {
+ const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+ const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
+
+ code.and_(from, ge);
+ code.andn(to, ge, to);
+ code.or_(from, to);
+
+ ctx.reg_alloc.DefineValue(inst, from);
+ } else {
+ const Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+ const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
+
+ code.and_(from, ge);
+ code.not_(ge);
+ code.and_(ge, to);
+ code.or_(from, ge);
+
+ ctx.reg_alloc.DefineValue(inst, from);
+ }
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp
new file mode 100644
index 0000000000..24fb895b61
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp
@@ -0,0 +1,303 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <limits>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/integer_of_size.hpp>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+namespace {
+
+enum class Op {
+ Add,
+ Sub,
+};
+
+template<Op op, size_t size, bool has_overflow_inst = false>
+void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size);
+ Xbyak::Reg addend = ctx.reg_alloc.UseGpr(args[1]).changeBit(size);
+ Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr().changeBit(size);
+
+ constexpr u64 int_max = static_cast<u64>(std::numeric_limits<mcl::signed_integer_of_size<size>>::max());
+ if constexpr (size < 64) {
+ code.xor_(overflow.cvt32(), overflow.cvt32());
+ code.bt(result.cvt32(), size - 1);
+ code.adc(overflow.cvt32(), int_max);
+ } else {
+ code.mov(overflow, int_max);
+ code.bt(result, 63);
+ code.adc(overflow, 0);
+ }
+
+    // overflow now contains 0x7F... if a was non-negative, or 0x80... if a was negative
+
+ if constexpr (op == Op::Add) {
+ code.add(result, addend);
+ } else {
+ code.sub(result, addend);
+ }
+
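+    // On signed overflow the computed sum/difference has the wrong sign, so cmovo replaces it
+    // with the saturation bound chosen above.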
+ if constexpr (size == 8) {
+ code.cmovo(result.cvt32(), overflow.cvt32());
+ } else {
+ code.cmovo(result, overflow);
+ }
+
+ code.seto(overflow.cvt8());
+ if constexpr (has_overflow_inst) {
+ if (const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp)) {
+ ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+ }
+ } else {
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<Op op, size_t size>
+void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ Xbyak::Reg op_result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size);
+ Xbyak::Reg addend = ctx.reg_alloc.UseScratchGpr(args[1]).changeBit(size);
+
+ constexpr u64 boundary = op == Op::Add ? std::numeric_limits<mcl::unsigned_integer_of_size<size>>::max() : 0;
+
+ if constexpr (op == Op::Add) {
+ code.add(op_result, addend);
+ } else {
+ code.sub(op_result, addend);
+ }
+ code.mov(addend, boundary);
+ if constexpr (size == 8) {
+ code.cmovae(addend.cvt32(), op_result.cvt32());
+ } else {
+ code.cmovae(addend, op_result);
+ }
+
+ const Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr();
+ code.setb(overflow.cvt8());
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, addend);
+}
+
+} // anonymous namespace
+
+void EmitX64::EmitSignedSaturatedAddWithFlag32(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Add, 32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedSubWithFlag32(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Sub, 32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const size_t N = args[1].GetImmediateU8();
+ ASSERT(N >= 1 && N <= 32);
+
+ if (N == 32) {
+ if (overflow_inst) {
+ const auto no_overflow = IR::Value(false);
+ overflow_inst->ReplaceUsesWith(no_overflow);
+ }
+ // TODO: DefineValue directly on Argument
+ const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]);
+ code.mov(result.cvt32(), source.cvt32());
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const u32 mask = (1u << N) - 1;
+ const u32 positive_saturated_value = (1u << (N - 1)) - 1;
+ const u32 negative_saturated_value = 1u << (N - 1);
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32();
+
+    // The lea below biases reg_a by negative_saturated_value, so overflow lands in [0, mask]
+    // exactly when reg_a already lies within the representable N-bit signed range.
+ code.lea(overflow, code.ptr[reg_a.cvt64() + negative_saturated_value]);
+
+ // Put the appropriate saturated value in result
+ code.mov(result, reg_a);
+ code.sar(result, 31);
+ code.xor_(result, positive_saturated_value);
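+    // (reg_a >> 31 is 0 for non-negative inputs and -1 for negative ones, so the XOR yields
+    //  either the positive bound or its complement, the negative bound.)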
+
+ // Do the saturation
+ code.cmp(overflow, mask);
+ code.cmovbe(result, reg_a);
+
+ if (overflow_inst) {
+ code.seta(overflow.cvt8());
+
+ ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) {
+ const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const size_t N = args[1].GetImmediateU8();
+ ASSERT(N <= 31);
+
+ const u32 saturated_value = (1u << N) - 1;
+
+ const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+ const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+ const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // Pseudocode: result = clamp(reg_a, 0, saturated_value);
+ code.xor_(overflow, overflow);
+ code.cmp(reg_a, saturated_value);
+ code.mov(result, saturated_value);
+ code.cmovle(result, overflow);
+ code.cmovbe(result, reg_a);
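+    // cmovle (signed) zeroes the result for anything not above the bound, covering negative inputs;
+    // cmovbe (unsigned, same flags) then restores reg_a whenever it is genuinely within [0, saturated_value].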
+
+ if (overflow_inst) {
+ code.seta(overflow.cvt8());
+
+ ctx.reg_alloc.DefineValue(overflow_inst, overflow);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Add, 8>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Add, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Add, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Add, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 x = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.movsx(x, x.cvt16());
+ code.movsx(y, y.cvt16());
+
+ code.imul(x, y);
+ code.lea(y, ptr[x.cvt64() + x.cvt64()]);
+ code.mov(tmp, x);
+ code.shr(tmp, 15);
+ code.xor_(y, x);
+ code.mov(y, 0x7FFF);
+ code.cmovns(y, tmp);
+
+ code.sets(tmp.cvt8());
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg64 x = ctx.reg_alloc.UseScratchGpr(args[0]);
+ const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]);
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+ code.movsxd(x, x.cvt32());
+ code.movsxd(y, y.cvt32());
+
+ code.imul(x, y);
+ code.lea(y, ptr[x + x]);
+ code.mov(tmp, x);
+ code.shr(tmp, 31);
+ code.xor_(y, x);
+ code.mov(y.cvt32(), 0x7FFFFFFF);
+ code.cmovns(y.cvt32(), tmp.cvt32());
+
+ code.sets(tmp.cvt8());
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Sub, 8>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Sub, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Sub, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitSignedSaturatedOp<Op::Sub, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Add, 8>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Add, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Add, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Add, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Sub, 8>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Sub, 16>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Sub, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitUnsignedSaturatedOp<Op::Sub, 64>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
new file mode 100644
index 0000000000..92e0841d8b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
@@ -0,0 +1,81 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const bool part1 = args[3].GetImmediateU1();
+
+ ASSERT(code.HasHostFeature(HostFeature::SHA));
+
+ // 3 2 1 0
+ // x = d c b a
+ // y = h g f e
+ // w = wk3 wk2 wk1 wk0
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(args[2]);
+
+ // x64 expects:
+ // 3 2 1 0
+ // src1 = c d g h
+ // src2 = a b e f
+ // xmm0 = - - wk1 wk0
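+    // sha256rnds2 reads the two rounds' message-schedule words implicitly from xmm0, hence xmm0
+    // is loaded with wk1:wk0 below and then shifted to wk3:wk2 (punpckhqdq) for the second pair
+    // of rounds.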
+
+ code.movaps(xmm0, y);
+ code.shufps(xmm0, x, 0b10111011); // src1
+ code.shufps(y, x, 0b00010001); // src2
+ code.movaps(x, xmm0);
+
+ code.movaps(xmm0, w);
+ code.sha256rnds2(x, y);
+
+ code.punpckhqdq(xmm0, xmm0);
+ code.sha256rnds2(y, x);
+
+ code.shufps(y, x, part1 ? 0b10111011 : 0b00010001);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ASSERT(code.HasHostFeature(HostFeature::SHA));
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.sha256msg1(x, y);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ASSERT(code.HasHostFeature(HostFeature::SHA));
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(args[2]);
+
+ code.movaps(xmm0, z);
+ code.palignr(xmm0, y, 4);
+ code.paddd(x, xmm0);
+ code.sha256msg2(x, z);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp
new file mode 100644
index 0000000000..b084a92f91
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp
@@ -0,0 +1,21 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/crypto/sm4.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Backend::X64 {
+
+void EmitX64::EmitSM4AccessSubstitutionBox(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ ctx.reg_alloc.HostCall(inst, args[0]);
+ code.CallFunction(&Common::Crypto::SM4::AccessSubstitutionBox);
+ code.movzx(code.ABI_RETURN.cvt32(), code.ABI_RETURN.cvt8());
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
new file mode 100644
index 0000000000..47180de04d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -0,0 +1,5929 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <algorithm>
+#include <bit>
+#include <bitset>
+#include <cstdlib>
+#include <type_traits>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+#include <mcl/type_traits/function_info.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/constants.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/math_util.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+template<typename Function>
+static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ (code.*fn)(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template<typename Function>
+static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ (code.*fn)(xmm_a, xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template<typename Lambda>
+static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
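+    // The vector argument is spilled to stack scratch space and the lambda is called with
+    // ABI_PARAM1 pointing at the result slot and ABI_PARAM2 at the argument slot; the result
+    // vector is then reloaded from the stack.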
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+ constexpr u32 stack_space = 2 * 16;
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+ constexpr u32 stack_space = 2 * 16;
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+ constexpr u32 stack_space = 3 * 16;
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.movaps(xword[code.ABI_PARAM3], arg2);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+ constexpr u32 stack_space = 2 * 16;
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const u8 arg2 = args[1].GetImmediateU8();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.mov(code.ABI_PARAM3, arg2);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+ constexpr u32 stack_space = 3 * 16;
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+
+ ctx.reg_alloc.HostCall(nullptr);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.movaps(xword[code.ABI_PARAM3], arg2);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ // TODO: DefineValue directly on Argument for index == 0
+
+ const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pextrb(dest, source, index);
+ } else {
+ code.pextrw(dest, source, u8(index / 2));
+ if (index % 2 == 1) {
+ code.shr(dest, 8);
+ } else {
+ code.and_(dest, 0xFF); // TODO: Remove when zext handling is corrected
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ // TODO: DefineValue directly on Argument for index == 0
+
+ const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pextrw(dest, source, index);
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ // TODO: DefineValue directly on Argument for index == 0
+
+ const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
+ code.pextrd(dest, source, index);
+ } else {
+ const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.pshufd(source, source, index);
+ code.movd(dest, source);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ if (index == 0) {
+ // TODO: DefineValue directly on Argument for index == 0
+ const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64();
+ const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
+ code.movq(dest, source);
+ ctx.reg_alloc.DefineValue(inst, dest);
+ return;
+ }
+
+ const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64();
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]);
+ code.pextrq(dest, source, 1);
+ } else {
+ const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.punpckhqdq(source, source);
+ code.movq(dest, source);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt8();
+
+ code.pinsrb(source_vector, source_elem.cvt32(), index);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ } else {
+ const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
+ const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.pextrw(tmp, source_vector, index / 2);
+ if (index % 2 == 0) {
+ code.and_(tmp, 0xFF00);
+ code.and_(source_elem, 0x00FF);
+ code.or_(tmp, source_elem);
+ } else {
+ code.and_(tmp, 0x00FF);
+ code.shl(source_elem, 8);
+ code.or_(tmp, source_elem);
+ }
+ code.pinsrw(source_vector, tmp, index / 2);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ }
+}
+
+void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+
+ const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt16();
+
+ code.pinsrw(source_vector, source_elem.cvt32(), index);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+}
+
+void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt32();
+
+ code.pinsrd(source_vector, source_elem, index);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ } else {
+ const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32();
+
+ code.pinsrw(source_vector, source_elem, index * 2);
+ code.shr(source_elem, 16);
+ code.pinsrw(source_vector, source_elem, index * 2 + 1);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ }
+}
+
+void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]);
+
+ code.pinsrq(source_vector, source_elem, index);
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ } else {
+ const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movq(tmp, source_elem);
+
+ if (index == 0) {
+ code.movsd(source_vector, tmp);
+ } else {
+ code.punpcklqdq(source_vector, tmp);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, source_vector);
+ }
+}
+
+static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsb(data, data);
+ } else {
+ const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+ code.pxor(temp, temp);
+ code.psubb(temp, data);
+ code.pminub(data, temp);
+ }
+}
+
+static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsw(data, data);
+ } else {
+ const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+ code.pxor(temp, temp);
+ code.psubw(temp, data);
+ code.pmaxsw(data, temp);
+ }
+}
+
+static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsd(data, data);
+ } else {
+ const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(temp, data);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubd(data, temp);
+ }
+}
+
+static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpabsq(data, data);
+ } else {
+ const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+ code.pshufd(temp, data, 0b11110101);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubq(data, temp);
+ }
+}
+
+static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ switch (esize) {
+ case 8:
+ VectorAbs8(code, ctx, data);
+ break;
+ case 16:
+ VectorAbs16(code, ctx, data);
+ break;
+ case 32:
+ VectorAbs32(code, ctx, data);
+ break;
+ case 64:
+ VectorAbs64(code, ctx, data);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorAbs8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorAbs(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorAbs(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorAbs(32, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorAbs(64, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorAdd8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddb);
+}
+
+void EmitX64::EmitVectorAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddw);
+}
+
+void EmitX64::EmitVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddd);
+}
+
+void EmitX64::EmitVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddq);
+}
+
+void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
+}
+
+void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.pandn(xmm_b, xmm_a);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_b);
+}
+
+static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+ if (code.HasHostFeature(HostFeature::GFNI)) {
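+        // gf2p8affineqb applies an 8x8 bit-matrix to each byte; the constant below encodes the
+        // per-byte arithmetic right shift (the replicated 0x80 rows broadcast the sign bit into
+        // the vacated high positions), so the whole shift is a single instruction.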
+ const u64 shift_matrix = shift_amount < 8
+ ? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
+ : 0x8080808080808080;
+ code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
+ return;
+ }
+
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.punpckhbw(tmp, result);
+ code.punpcklbw(result, result);
+ code.psraw(tmp, 8 + shift_amount);
+ code.psraw(result, 8 + shift_amount);
+ code.packsswb(result, tmp);
+}
+
+void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ ArithmeticShiftRightByte(ctx, code, result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psraw(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psrad(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63));
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpsraq(result, result, shift_amount);
+ } else {
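+        // There is no 64-bit arithmetic shift before AVX-512: shift logically,
+        // isolate the shifted-down sign bit, and subtract it from zero to smear
+        // ones back across the vacated high bits.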
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ const u64 sign_bit = 0x80000000'00000000u >> shift_amount;
+
+ code.pxor(tmp2, tmp2);
+ code.psrlq(result, shift_amount);
+ code.movdqa(tmp1, code.Const(xword, sign_bit, sign_bit));
+ code.pand(tmp1, result);
+ code.psubq(tmp2, tmp1);
+ code.por(result, tmp2);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
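+// Scalar helper for the variable vector shifts: the shift amount is the signed
+// low byte of y. Positive amounts shift left, negative amounts shift right
+// (arithmetically for signed T), and amounts whose magnitude reaches the bit
+// width yield 0, or a sign-fill for signed right shifts.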
+template<typename T>
+static constexpr T VShift(T x, T y) {
+ const s8 shift_amount = static_cast<s8>(static_cast<u8>(y));
+ const s64 bit_size = static_cast<s64>(mcl::bitsizeof<T>);
+
+ if constexpr (std::is_signed_v<T>) {
+ if (shift_amount >= bit_size) {
+ return 0;
+ }
+
+ if (shift_amount <= -bit_size) {
+ // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses
+ // as a grouping in terms of precedence, causing warning C4554 to fire. See:
+ // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html
+ return x >> (T(bit_size - 1));
+ }
+ } else if (shift_amount <= -bit_size || shift_amount >= bit_size) {
+ return 0;
+ }
+
+ if (shift_amount < 0) {
+ return x >> T(-shift_amount);
+ }
+
+ using unsigned_type = std::make_unsigned_t<T>;
+ return static_cast<T>(static_cast<unsigned_type>(x) << static_cast<unsigned_type>(shift_amount));
+}
+
+void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s8>);
+ });
+}
+
+void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = xmm16;
+ const Xbyak::Xmm tmp = xmm17;
+
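+        // Compute both shift directions and select per lane: mask the shift
+        // amounts to their low byte, shift right arithmetically by the negated
+        // amount and left by the original amount, then use the sign of each
+        // shift byte (via vpmovb2m) to pick the right-shift result for
+        // negative amounts.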
+ code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.vpxord(right_shift, right_shift, right_shift);
+ code.vpsubw(right_shift, right_shift, left_shift);
+
+ code.vpsllw(xmm0, left_shift, 8);
+ code.vpsraw(xmm0, xmm0, 15);
+
+ const Xbyak::Opmask mask = k1;
+ code.vpmovb2m(mask, xmm0);
+
+ code.vpandd(right_shift, right_shift, tmp);
+ code.vpandd(left_shift, left_shift, tmp);
+
+ code.vpsravw(tmp, result, right_shift);
+ code.vpsllvw(result, result, left_shift);
+ code.vpblendmb(result | mask, result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>);
+ });
+}
+
+void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
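+        // Same compute-both-then-blend approach as the 16-bit AVX-512 path:
+        // vpslld by 24 moves each shift byte's sign bit into bit 31, so
+        // blendvps selects the arithmetic-right result for negative amounts.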
+ code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
+ code.vpxor(right_shift, right_shift, right_shift);
+ code.vpsubd(right_shift, right_shift, left_shift);
+
+ code.vpslld(xmm0, left_shift, 24);
+
+ code.vpand(right_shift, right_shift, tmp);
+ code.vpand(left_shift, left_shift, tmp);
+
+ code.vpsravd(tmp, result, right_shift);
+ code.vpsllvd(result, result, left_shift);
+ code.blendvps(result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>);
+ });
+}
+
+void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = xmm16;
+ const Xbyak::Xmm tmp = xmm17;
+
+ code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
+ code.vpxorq(right_shift, right_shift, right_shift);
+ code.vpsubq(right_shift, right_shift, left_shift);
+
+ code.vpsllq(xmm0, left_shift, 56);
+ const Xbyak::Opmask mask = k1;
+ code.vpmovq2m(mask, xmm0);
+
+ code.vpandq(right_shift, right_shift, tmp);
+ code.vpandq(left_shift, left_shift, tmp);
+
+ code.vpsravq(tmp, result, right_shift);
+ code.vpsllvq(result, result, left_shift);
+ code.vpblendmq(result | mask, result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>);
+ });
+}
+
+void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastb(a, a);
+ code.vmovq(a, a);
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(tmp, tmp);
+ code.pshufb(a, tmp);
+ code.movq(a, a);
+ } else {
+ code.punpcklbw(a, a);
+ code.pshuflw(a, a, 0);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.pshuflw(a, a, 0);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.pshuflw(a, a, 0b01000100);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastb(a, a);
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(tmp, tmp);
+ code.pshufb(a, tmp);
+ } else {
+ code.punpcklbw(a, a);
+ code.pshuflw(a, a, 0);
+ code.punpcklqdq(a, a);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastw(a, a);
+ } else {
+ code.pshuflw(a, a, 0);
+ code.punpcklqdq(a, a);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastd(a, a);
+ } else {
+ code.pshufd(a, a, 0);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastq(a, a);
+ } else {
+ code.punpcklqdq(a, a);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 16);
+
+ if (index > 0) {
+ code.psrldq(a, index);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastb(a, a);
+ code.vmovq(a, a);
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(tmp, tmp);
+ code.pshufb(a, tmp);
+ code.movq(a, a);
+ } else {
+ code.punpcklbw(a, a);
+ code.pshuflw(a, a, 0);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 8);
+
+ if (index > 0) {
+ code.psrldq(a, u8(index * 2));
+ }
+
+ code.pshuflw(a, a, 0);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 4);
+
+ if (index > 0) {
+ code.psrldq(a, u8(index * 4));
+ }
+
+ code.pshuflw(a, a, 0b01'00'01'00);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 16);
+
+ if (index > 0) {
+ code.psrldq(a, index);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastb(a, a);
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(tmp, tmp);
+ code.pshufb(a, tmp);
+ } else {
+ code.punpcklbw(a, a);
+ code.pshuflw(a, a, 0);
+ code.punpcklqdq(a, a);
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 8);
+
+ if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) {
+ code.vpbroadcastw(a, a);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ return;
+ }
+
+ if (index < 4) {
+ code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index));
+ code.punpcklqdq(a, a);
+ } else {
+ code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4)));
+ code.punpckhqdq(a, a);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 4);
+
+ code.pshufd(a, a, mcl::bit::replicate_element<2, u8>(index));
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ ASSERT(args[1].IsImmediate());
+ const u8 index = args[1].GetImmediateU8();
+ ASSERT(index < 2);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpermilpd(a, a, mcl::bit::replicate_element<1, u8>(index));
+ } else {
+ if (index == 0) {
+ code.punpcklqdq(a, a);
+ } else {
+ code.punpckhqdq(a, a);
+ }
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+template<typename T>
+static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArray<T>& data) {
+ for (size_t i = 0; i < result.size(); i++) {
+ T element = data[i];
+
+ size_t count = mcl::bitsizeof<T>;
+ while (element != 0) {
+ element >>= 1;
+ --count;
+ }
+
+ result[i] = static_cast<T>(count);
+ }
+}
+
+void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::GFNI)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ // Reverse bits:
+ code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0);
+
+ // Perform a tzcnt:
+ // Isolate lowest set bit
+ code.pcmpeqb(result, result);
+ code.paddb(result, data);
+ code.pandn(result, data);
+ // Convert lowest set bit into an index
+ code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000));
+ code.movdqa(tmp2, tmp1);
+
+ code.pshufb(tmp2, data);
+ code.psrlw(data, 4);
+ code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
+ code.pshufb(tmp1, data);
+
+ code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404));
+
+ code.pcmpeqb(data, tmp1);
+ code.pand(data, tmp2);
+ code.paddb(data, tmp1);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
+}
+
+void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
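+        // Smear the highest set bit rightwards so each halfword becomes
+        // 2^(16 - clz) - 1, hash that into a 4-bit index by multiplying by
+        // 0xF0D3 and shifting right by 12, then translate the index into a
+        // count with a PSHUFB lookup table. The extra flag bits zero the high
+        // result byte and make the clz == 0 case (which collides with the
+        // all-zero input at index 0) produce 0.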
+ code.vpsrlw(tmp, data, 1);
+ code.vpor(data, data, tmp);
+ code.vpsrlw(tmp, data, 2);
+ code.vpor(data, data, tmp);
+ code.vpsrlw(tmp, data, 4);
+ code.vpor(data, data, tmp);
+ code.vpsrlw(tmp, data, 8);
+ code.vpor(data, data, tmp);
+ code.vpcmpeqw(zeros, zeros, zeros);
+ code.vpcmpeqw(tmp, tmp, tmp);
+ code.vpcmpeqw(zeros, zeros, data);
+ code.vpmullw(data, data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
+ code.vpsllw(tmp, tmp, 15);
+ code.vpsllw(zeros, zeros, 7);
+ code.vpsrlw(data, data, 12);
+ code.vmovdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
+ code.vpor(tmp, tmp, zeros);
+ code.vpor(data, data, tmp);
+ code.vpshufb(result, result, data);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
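+        // Same smear + multiplicative-hash + PSHUFB-lookup algorithm as the
+        // AVX path above, written with two-operand SSE forms.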
+ code.movdqa(tmp, data);
+ code.psrlw(tmp, 1);
+ code.por(data, tmp);
+ code.movdqa(tmp, data);
+ code.psrlw(tmp, 2);
+ code.por(data, tmp);
+ code.movdqa(tmp, data);
+ code.psrlw(tmp, 4);
+ code.por(data, tmp);
+ code.movdqa(tmp, data);
+ code.psrlw(tmp, 8);
+ code.por(data, tmp);
+ code.pcmpeqw(zeros, zeros);
+ code.pcmpeqw(tmp, tmp);
+ code.pcmpeqw(zeros, data);
+ code.pmullw(data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3));
+ code.psllw(tmp, 15);
+ code.psllw(zeros, 7);
+ code.psrlw(data, 12);
+ code.movdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01));
+ code.por(tmp, zeros);
+ code.por(data, tmp);
+ code.pshufb(result, data);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
+}
+
+void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vplzcntd(data, data);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
+}
+
+void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.pand(lhs, tmp);
+ code.pand(rhs, tmp);
+ code.packuswb(lhs, rhs);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+ code.pxor(zero, zero);
+
+ code.pblendw(lhs, zero, 0b10101010);
+ code.pblendw(rhs, zero, 0b10101010);
+ code.packusdw(lhs, rhs);
+ } else {
+ code.pslld(lhs, 16);
+ code.psrad(lhs, 16);
+
+ code.pslld(rhs, 16);
+ code.psrad(rhs, 16);
+
+ code.packssdw(lhs, rhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.shufps(lhs, rhs, 0b10001000);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.shufpd(lhs, rhs, 0b00);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.punpcklbw(lhs, rhs);
+ code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
+ } else {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.pand(lhs, tmp);
+ code.pand(rhs, tmp);
+ code.packuswb(lhs, rhs);
+ code.pshufd(lhs, lhs, 0b11011000);
+ code.movq(lhs, lhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.punpcklwd(lhs, rhs);
+ code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
+ } else {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.pslld(lhs, 16);
+ code.psrad(lhs, 16);
+
+ code.pslld(rhs, 16);
+ code.psrad(rhs, 16);
+
+ code.packssdw(lhs, rhs);
+ code.pshufd(lhs, lhs, 0b11011000);
+ code.movq(lhs, lhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+        // copy dword 0 of rhs into dword 1 of lhs, zero out the upper 8 bytes
+ code.insertps(lhs, rhs, 0b00011100);
+ } else {
+ code.unpcklps(lhs, rhs);
+ code.movq(lhs, lhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.psraw(lhs, 8);
+ code.psraw(rhs, 8);
+ code.packsswb(lhs, rhs);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.psrad(lhs, 16);
+ code.psrad(rhs, 16);
+ code.packssdw(lhs, rhs);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.shufps(lhs, rhs, 0b11011101);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.shufpd(lhs, rhs, 0b11);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.punpcklbw(lhs, rhs);
+ code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
+ } else {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.psraw(lhs, 8);
+ code.psraw(rhs, 8);
+ code.packsswb(lhs, rhs);
+ code.pshufd(lhs, lhs, 0b11011000);
+ code.movq(lhs, lhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.punpcklwd(lhs, rhs);
+ code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
+ } else {
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.psrad(lhs, 16);
+ code.psrad(rhs, 16);
+ code.packssdw(lhs, rhs);
+ code.pshufd(lhs, lhs, 0b11011000);
+ code.movq(lhs, lhs);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+}
+
+void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ // copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes
+ code.insertps(rhs, lhs, 0b01001100);
+
+ ctx.reg_alloc.DefineValue(inst, rhs);
+ } else {
+ const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+
+ code.xorps(zero, zero);
+ code.unpcklps(lhs, rhs);
+ code.unpckhpd(lhs, zero);
+
+ ctx.reg_alloc.DefineValue(inst, lhs);
+ }
+}
+
+void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pxor);
+}
+
+void EmitX64::EmitVectorEqual8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqb);
+}
+
+void EmitX64::EmitVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqw);
+}
+
+void EmitX64::EmitVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqd);
+}
+
+void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqd(xmm_a, xmm_b);
+ code.pshufd(tmp, xmm_a, 0b10110001);
+ code.pand(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqq(xmm_a, xmm_b);
+ code.pshufd(tmp, xmm_a, 0b01001110);
+ code.pand(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ } else {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.pcmpeqd(xmm_a, xmm_b);
+ code.pshufd(tmp, xmm_a, 0b10110001);
+ code.pand(xmm_a, tmp);
+ code.pshufd(tmp, xmm_a, 0b01001110);
+ code.pand(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ }
+}
+
+void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const u8 position = args[2].GetImmediateU8();
+ ASSERT(position % 8 == 0);
+
+ if (position == 0) {
+ ctx.reg_alloc.DefineValue(inst, args[0]);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.palignr(xmm_b, xmm_a, position / 8);
+ ctx.reg_alloc.DefineValue(inst, xmm_b);
+ return;
+ }
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.psrldq(xmm_a, position / 8);
+ code.pslldq(xmm_b, (128 - position) / 8);
+ code.por(xmm_a, xmm_b);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ const u8 position = args[2].GetImmediateU8();
+ ASSERT(position % 8 == 0);
+
+ if (position != 0) {
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.punpcklqdq(xmm_a, xmm_b);
+ code.psrldq(xmm_a, position / 8);
+ }
+ code.movq(xmm_a, xmm_a);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorGreaterS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtb);
+}
+
+void EmitX64::EmitVectorGreaterS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtw);
+}
+
+void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtd);
+}
+
+void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE42)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
+ for (size_t i = 0; i < result.size(); ++i) {
+ result[i] = (a[i] > b[i]) ? ~u64(0) : 0;
+ }
+ });
+}
+
+static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
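+    // Halving add without widening: floor((a + b) / 2) == (a & b) + ((a ^ b) >> 1),
+    // where the shift is arithmetic so the sign is preserved.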
+ code.movdqa(tmp, b);
+ code.pand(tmp, a);
+ code.pxor(a, b);
+
+ switch (esize) {
+ case 8:
+ ArithmeticShiftRightByte(ctx, code, a, 1);
+ code.paddb(a, tmp);
+ break;
+ case 16:
+ code.psraw(a, 1);
+ code.paddw(a, tmp);
+ break;
+ case 32:
+ code.psrad(a, 1);
+ code.paddd(a, tmp);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddSigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddSigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddSigned(32, ctx, inst, code);
+}
+
+static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
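+    // pavgb/pavgw round the average up, but the halving add truncates, so the
+    // rounding bit ((a ^ b) & 1) is subtracted back out. The 32-bit case uses
+    // (a & b) + ((a ^ b) >> 1) directly since there is no pavgd.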
+ code.movdqa(tmp, b);
+
+ switch (esize) {
+ case 8:
+ code.pavgb(tmp, a);
+ code.pxor(a, b);
+ code.pand(a, code.Const(xword, 0x0101010101010101, 0x0101010101010101));
+ code.psubb(tmp, a);
+ break;
+ case 16:
+ code.pavgw(tmp, a);
+ code.pxor(a, b);
+ code.pand(a, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
+ code.psubw(tmp, a);
+ break;
+ case 32:
+ code.pand(tmp, a);
+ code.pxor(a, b);
+ code.psrld(a, 1);
+ code.paddd(tmp, a);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, tmp);
+}
+
+void EmitX64::EmitVectorHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddUnsigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddUnsigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingAddUnsigned(32, ctx, inst, code);
+}
+
+static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
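+    // Signed halving subtract: for 8/16-bit elements, bias both operands into
+    // unsigned range so pavg can be reused, since a - avg_ceil(a, b) == (a - b) >> 1.
+    // The 32-bit case uses the identity a - b == (a ^ b) - 2 * (~a & b).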
+ switch (esize) {
+ case 8: {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
+ code.pxor(a, tmp);
+ code.pxor(b, tmp);
+ code.pavgb(b, a);
+ code.psubb(a, b);
+ break;
+ }
+ case 16: {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
+ code.pxor(a, tmp);
+ code.pxor(b, tmp);
+ code.pavgw(b, a);
+ code.psubw(a, b);
+ break;
+ }
+ case 32:
+ code.pxor(a, b);
+ code.pand(b, a);
+ code.psrad(a, 1);
+ code.psubd(a, b);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubSigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubSigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubSigned(32, ctx, inst, code);
+}
+
+static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ switch (esize) {
+ case 8:
+ code.pavgb(b, a);
+ code.psubb(a, b);
+ break;
+ case 16:
+ code.pavgw(b, a);
+ code.psubw(a, b);
+ break;
+ case 32:
+ code.pxor(a, b);
+ code.pand(b, a);
+ code.psrld(a, 1);
+ code.psubd(a, b);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubUnsigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubUnsigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorHalvingSubUnsigned(32, ctx, inst, code);
+}
+
+static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ switch (size) {
+ case 8:
+ code.punpcklbw(a, b);
+ break;
+ case 16:
+ code.punpcklwd(a, b);
+ break;
+ case 32:
+ code.punpckldq(a, b);
+ break;
+ case 64:
+ code.punpcklqdq(a, b);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorInterleaveLower8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveLower(code, ctx, inst, 8);
+}
+
+void EmitX64::EmitVectorInterleaveLower16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveLower(code, ctx, inst, 16);
+}
+
+void EmitX64::EmitVectorInterleaveLower32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveLower(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveLower(code, ctx, inst, 64);
+}
+
+static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ switch (size) {
+ case 8:
+ code.punpckhbw(a, b);
+ break;
+ case 16:
+ code.punpckhwd(a, b);
+ break;
+ case 32:
+ code.punpckhdq(a, b);
+ break;
+ case 64:
+ code.punpckhqdq(a, b);
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorInterleaveUpper8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveUpper(code, ctx, inst, 8);
+}
+
+void EmitX64::EmitVectorInterleaveUpper16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveUpper(code, ctx, inst, 16);
+}
+
+void EmitX64::EmitVectorInterleaveUpper32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveUpper(code, ctx, inst, 32);
+}
+
+void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorInterleaveUpper(code, ctx, inst, 64);
+}
+
+void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ if (shift_amount == 0) {
+        // Do nothing
+ } else if (shift_amount >= 8) {
+ code.pxor(result, result);
+ } else if (shift_amount == 1) {
+ code.paddb(result, result);
+ } else if (code.HasHostFeature(HostFeature::GFNI)) {
+ const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
+ code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
+ } else {
+ const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
+ const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
+
+ code.psllw(result, shift_amount);
+ code.pand(result, code.Const(xword, mask, mask));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psllw(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.pslld(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psllq(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ if (shift_amount == 0) {
+ // Do nothing
+ } else if (shift_amount >= 8) {
+ code.pxor(result, result);
+ } else if (code.HasHostFeature(HostFeature::GFNI)) {
+ const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
+ code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0);
+ } else {
+ const u64 replicand = 0xFEULL >> shift_amount;
+ const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand);
+
+ code.psrlw(result, shift_amount);
+ code.pand(result, code.Const(xword, mask, mask));
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psrlw(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psrld(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const u8 shift_amount = args[1].GetImmediateU8();
+
+ code.psrlq(result, shift_amount);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ const Xbyak::Opmask negative_mask = k1;
+ code.pxor(tmp, tmp);
+ code.vpcmpb(negative_mask, left_shift, tmp, CmpInt::LessThan);
+
+ // Reverse bits of negative-shifts
+ code.vmovaps(xmm0, code.BConst<64>(xword, 0x8040201008040201));
+ code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0);
+
+ // Turn all negative shifts into left-shifts
+ code.pabsb(left_shift, left_shift);
+
+ const Xbyak::Opmask valid_index = k2;
+ code.vptestnmb(valid_index, left_shift, code.BConst<8>(xword, 0xF8));
+
+        // gf2p8mulb's reduction by the polynomial x^8 + x^4 + x^3 + x + 1 only
+        // applies when the multiplication overflows. Masking away any bits that
+        // would have overflowed turns the polynomial multiplication into an
+        // ordinary modular multiplication.
+ code.movdqa(tmp, code.Const(xword, 0x01'03'07'0f'1f'3f'7f'ff, 0));
+ code.vpshufb(tmp | valid_index | T_z, tmp, left_shift);
+ code.pand(result, tmp);
+
+ // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc
+ code.pxor(tmp, tmp);
+ code.movsd(tmp, xmm0);
+ code.pshufb(tmp, left_shift);
+
+ code.gf2p8mulb(result, tmp);
+
+ // Un-reverse bits of negative-shifts
+ code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>);
+ });
+}
+
+void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = xmm16;
+ const Xbyak::Xmm tmp = xmm17;
+
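+        // Shift each lane both left and right (by the negated amount), with the
+        // counts masked to their low byte; the direction that does not apply
+        // ends up with a count of 16 or more and yields zero, so the two
+        // results can simply be OR'd together.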
+ code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.vpxord(right_shift, right_shift, right_shift);
+ code.vpsubw(right_shift, right_shift, left_shift);
+ code.vpandd(left_shift, left_shift, tmp);
+ code.vpandd(right_shift, right_shift, tmp);
+
+ code.vpsllvw(tmp, result, left_shift);
+ code.vpsrlvw(result, result, right_shift);
+ code.vpord(result, result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>);
+ });
+}
+
+void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF));
+ code.vpxor(right_shift, right_shift, right_shift);
+ code.vpsubd(right_shift, right_shift, left_shift);
+ code.vpand(left_shift, left_shift, tmp);
+ code.vpand(right_shift, right_shift, tmp);
+
+ code.vpsllvd(tmp, result, left_shift);
+ code.vpsrlvd(result, result, right_shift);
+ code.vpor(result, result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
+ });
+}
+
+void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.vmovdqa(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF));
+ code.vpxor(right_shift, right_shift, right_shift);
+ code.vpsubq(right_shift, right_shift, left_shift);
+ code.vpand(left_shift, left_shift, tmp);
+ code.vpand(right_shift, right_shift, tmp);
+
+ code.vpsllvq(tmp, result, left_shift);
+ code.vpsrlvq(result, result, right_shift);
+ code.vpor(result, result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
+ });
+}
+
+void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
+ code.pcmpgtb(tmp_b, a);
+ code.pand(b, tmp_b);
+ code.pandn(tmp_b, a);
+ code.por(tmp_b, b);
+
+ ctx.reg_alloc.DefineValue(inst, tmp_b);
+}
+
+void EmitX64::EmitVectorMaxS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
+}
+
+void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
+ code.pcmpgtd(tmp_b, a);
+ code.pand(b, tmp_b);
+ code.pandn(tmp_b, a);
+ code.por(tmp_b, b);
+
+ ctx.reg_alloc.DefineValue(inst, tmp_b);
+}
+
+void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.vpcmpgtq(xmm0, y, x);
+ code.pblendvb(x, y);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); });
+ });
+}
+
+void EmitX64::EmitVectorMaxU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
+}
+
+void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
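+    // For unsigned values, max(a, b) == saturating_sub(a, b) + b.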
+ code.psubusw(a, b);
+ code.paddw(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
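+    // pmaxud requires SSE4.1: flip the sign bits so the signed pcmpgtd orders
+    // the values as unsigned, then select with pand/pandn/por.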
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
+ code.pxor(tmp_b, tmp);
+ code.pxor(tmp, a);
+
+ code.pcmpgtd(tmp, tmp_b);
+ code.pand(a, tmp);
+ code.pandn(tmp, b);
+ code.por(a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
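+        // vpcmpgtq is a signed compare: bias both operands by 2^63 first, then
+        // use the resulting mask in xmm0 to blend the larger element into x.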
+ code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
+ code.vpsubq(tmp, y, xmm0);
+ code.vpsubq(xmm0, x, xmm0);
+ code.vpcmpgtq(xmm0, tmp, xmm0);
+ code.pblendvb(x, y);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); });
+ });
+}
+
+void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
+ code.pcmpgtb(tmp_b, a);
+ code.pand(a, tmp_b);
+ code.pandn(tmp_b, b);
+ code.por(a, tmp_b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMinS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
+}
+
+void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
+ code.pcmpgtd(tmp_b, a);
+ code.pand(a, tmp_b);
+ code.pandn(tmp_b, b);
+ code.por(a, tmp_b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.vpcmpgtq(xmm0, y, x);
+ code.pblendvb(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); });
+ });
+}
+
+void EmitX64::EmitVectorMinU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
+}
+
+void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+
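+    // For unsigned values, min(a, b) == b - saturating_sub(b, a).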
+ code.psubusw(tmp_b, a);
+ code.psubw(b, tmp_b);
+
+ ctx.reg_alloc.DefineValue(inst, b);
+}
+
+void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
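+    // pminud requires SSE4.1: bias both operands by 0x80000000 so the signed
+    // pcmpgtd compares them as unsigned, then select the smaller element with
+    // pand/pandn/por.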
+ const Xbyak::Xmm sint_max_plus_one = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(sint_max_plus_one, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+
+ const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_a, a);
+ code.psubd(tmp_a, sint_max_plus_one);
+
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp_b, b);
+ code.psubd(tmp_b, sint_max_plus_one);
+
+ code.pcmpgtd(tmp_b, tmp_a);
+ code.pand(a, tmp_b);
+ code.pandn(tmp_b, b);
+ code.por(a, tmp_b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
+ code.vpsubq(tmp, y, xmm0);
+ code.vpsubq(xmm0, x, xmm0);
+ code.vpcmpgtq(xmm0, tmp, xmm0);
+ code.pblendvb(y, x);
+
+ ctx.reg_alloc.DefineValue(inst, y);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); });
+ });
+}
+
+void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm();
+
+ // TODO: Optimize
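+    // Multiply the low and high byte of each 16-bit lane separately with
+    // pmullw, then recombine: the low products keep only their low byte and
+    // the high products are shifted back up into the high byte.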
+ code.movdqa(tmp_a, a);
+ code.movdqa(tmp_b, b);
+ code.pmullw(a, b);
+ code.psrlw(tmp_a, 8);
+ code.psrlw(tmp_b, 8);
+ code.pmullw(tmp_a, tmp_b);
+ code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.psllw(tmp_a, 8);
+ code.por(a, tmp_a);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmullw);
+}
+
+void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
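+    // pmulld requires SSE4.1: multiply the even and odd dword lanes separately
+    // with pmuludq, take the low 32 bits of each 64-bit product with pshufd,
+    // and interleave the two halves back together.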
+ code.movdqa(tmp, a);
+ code.psrlq(a, 32);
+ code.pmuludq(tmp, b);
+ code.psrlq(b, 32);
+ code.pmuludq(a, b);
+ code.pshufd(tmp, tmp, 0b00001000);
+ code.pshufd(b, a, 0b00001000);
+ code.punpckldq(tmp, b);
+
+ ctx.reg_alloc.DefineValue(inst, tmp);
+}
+
+void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+ EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
+ return;
+ }
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr();
+
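+        // No vector 64-bit multiply below AVX-512DQ: move each lane through
+        // GPRs and use scalar imul.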
+ code.movq(tmp1, a);
+ code.movq(tmp2, b);
+ code.imul(tmp2, tmp1);
+ code.pextrq(tmp1, a, 1);
+ code.movq(a, tmp2);
+ code.pextrq(tmp2, b, 1);
+ code.imul(tmp1, tmp2);
+ code.pinsrq(a, tmp1, 1);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ return;
+ }
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
+
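+ // 64x64->64 multiply from 32-bit pieces: low*low plus both cross products shifted up by 32; the high*high term only affects bits 64 and above and is discarded.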
+ code.movdqa(tmp1, a);
+ code.movdqa(tmp2, a);
+ code.movdqa(tmp3, b);
+
+ code.psrlq(tmp1, 32);
+ code.psrlq(tmp3, 32);
+
+ code.pmuludq(tmp2, b);
+ code.pmuludq(tmp3, a);
+ code.pmuludq(b, tmp1);
+
+ code.paddq(b, tmp3);
+ code.psllq(b, 32);
+ code.paddq(tmp2, b);
+
+ ctx.reg_alloc.DefineValue(inst, tmp2);
+}
+
+void EmitX64::EmitVectorMultiplySignedWiden8(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplySignedWiden8");
+}
+
+void EmitX64::EmitVectorMultiplySignedWiden16(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplySignedWiden16");
+}
+
+void EmitX64::EmitVectorMultiplySignedWiden32(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplySignedWiden32");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden8(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden8");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden16(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden16");
+}
+
+void EmitX64::EmitVectorMultiplyUnsignedWiden32(EmitContext&, IR::Inst*) {
+ ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden32");
+}
+
+void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmovwb(result, a);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+
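+ // Mask each halfword down to its low byte so packuswb cannot saturate, then pack against zeros to leave the narrowed bytes in the lower half.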
+ code.pxor(zeros, zeros);
+ code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.packuswb(a, zeros);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmovdw(result, a);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(zeros, zeros);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pblendw(a, zeros, 0b10101010);
+ code.packusdw(a, zeros);
+ } else {
+ code.pslld(a, 16);
+ code.psrad(a, 16);
+ code.packssdw(a, zeros);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmovqd(result, a);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+
+ code.pxor(zeros, zeros);
+ code.shufps(a, zeros, 0b00001000);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ code.vpternlogq(result, operand, operand, u8(~Tern::c));
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm();
+ code.pcmpeqw(xmm_b, xmm_b);
+ code.pxor(xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::por);
+}
+
+void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
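+ // Concatenate the lower halves, add each byte to its neighbour by shifting the pair within its 16-bit lane, then narrow the per-pair sums back down to bytes.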
+ code.punpcklqdq(xmm_a, xmm_b);
+ code.movdqa(tmp, xmm_a);
+ code.psllw(xmm_a, 8);
+ code.paddw(xmm_a, tmp);
+ code.pxor(tmp, tmp);
+ code.psrlw(xmm_a, 8);
+ code.packuswb(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.punpcklqdq(xmm_a, xmm_b);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pxor(tmp, tmp);
+ code.phaddw(xmm_a, tmp);
+ } else {
+ code.movdqa(tmp, xmm_a);
+ code.pslld(xmm_a, 16);
+ code.paddd(xmm_a, tmp);
+ code.pxor(tmp, tmp);
+ code.psrad(xmm_a, 16);
+ code.packssdw(xmm_a, tmp); // Note: packusdw is SSE4.1, hence the arithmetic shift above.
+ }
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.punpcklqdq(xmm_a, xmm_b);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pxor(tmp, tmp);
+ code.phaddd(xmm_a, tmp);
+ } else {
+ code.movdqa(tmp, xmm_a);
+ code.psllq(xmm_a, 32);
+ code.paddq(xmm_a, tmp);
+ code.psrlq(xmm_a, 32);
+ code.pshufd(xmm_a, xmm_a, 0b11011000);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
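+ // Pairwise byte addition via the same shift-within-16-bit-lane trick as the lower variant above, applied to both inputs and packed into a single result.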
+ code.movdqa(c, a);
+ code.movdqa(d, b);
+ code.psllw(a, 8);
+ code.psllw(b, 8);
+ code.paddw(a, c);
+ code.paddw(b, d);
+ code.psrlw(a, 8);
+ code.psrlw(b, 8);
+ code.packuswb(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.phaddw(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ } else {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.movdqa(d, b);
+ code.pslld(a, 16);
+ code.pslld(b, 16);
+ code.paddd(a, c);
+ code.paddd(b, d);
+ code.psrad(a, 16);
+ code.psrad(b, 16);
+ code.packssdw(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ }
+}
+
+void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.phaddd(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ } else {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.movdqa(d, b);
+ code.psllq(a, 32);
+ code.psllq(b, 32);
+ code.paddq(a, c);
+ code.paddq(b, d);
+ code.shufps(a, b, 0b11011101);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ }
+}
+
+void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.punpcklqdq(a, b);
+ code.punpckhqdq(c, b);
+ code.paddq(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
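+ // Sign-extend the even bytes (shift left, then arithmetic shift right) and the odd bytes (arithmetic shift right only) of each 16-bit lane, then add them for the widened pairwise sums.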
+ code.movdqa(c, a);
+ code.psllw(a, 8);
+ code.psraw(c, 8);
+ code.psraw(a, 8);
+ code.paddw(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.pslld(a, 16);
+ code.psrad(c, 16);
+ code.psrad(a, 16);
+ code.paddd(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ const Xbyak::Xmm c = xmm16;
+ code.vpsraq(c, a, 32);
+ code.vpsllq(a, a, 32);
+ code.vpsraq(a, a, 32);
+ code.vpaddq(a, a, c);
+ } else {
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
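+ // There is no 64-bit arithmetic shift before AVX-512, so sign-extend each 32-bit half manually: isolate the sign bits, turn them into an all-ones upper dword with psrad, and OR that onto the zero-extended halves before the 64-bit add.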
+ code.movdqa(c, a);
+ code.psllq(a, 32);
+ code.movdqa(tmp1, code.Const(xword, 0x80000000'00000000, 0x80000000'00000000));
+ code.movdqa(tmp2, tmp1);
+ code.pand(tmp1, a);
+ code.pand(tmp2, c);
+ code.psrlq(a, 32);
+ code.psrlq(c, 32);
+ code.psrad(tmp1, 31);
+ code.psrad(tmp2, 31);
+ code.por(a, tmp1);
+ code.por(c, tmp2);
+ code.paddq(a, c);
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.psllw(a, 8);
+ code.psrlw(c, 8);
+ code.psrlw(a, 8);
+ code.paddw(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.pslld(a, 16);
+ code.psrld(c, 16);
+ code.psrld(a, 16);
+ code.paddd(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(c, a);
+ code.psllq(a, 32);
+ code.psrlq(c, 32);
+ code.psrlq(a, 32);
+ code.paddq(a, c);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+template<typename T, typename Function>
+static void PairedOperation(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y, Function fn) {
+ const size_t range = x.size() / 2;
+
+ for (size_t i = 0; i < range; i++) {
+ result[i] = fn(x[2 * i], x[2 * i + 1]);
+ }
+
+ for (size_t i = 0; i < range; i++) {
+ result[range + i] = fn(y[2 * i], y[2 * i + 1]);
+ }
+}
+
+template<typename T, typename Function>
+static void LowerPairedOperation(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y, Function fn) {
+ const size_t range = x.size() / 4;
+
+ for (size_t i = 0; i < range; i++) {
+ result[i] = fn(x[2 * i], x[2 * i + 1]);
+ }
+
+ for (size_t i = 0; i < range; i++) {
+ result[range + i] = fn(y[2 * i], y[2 * i + 1]);
+ }
+}
+
+template<typename T>
+static void PairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
+ PairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); });
+}
+
+template<typename T>
+static void PairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
+ PairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); });
+}
+
+template<typename T>
+static void LowerPairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
+ LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); });
+}
+
+template<typename T>
+static void LowerPairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
+ LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); });
+}
+
+template<typename Function>
+static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
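+ // Concatenate the lower halves, separate even- and odd-indexed bytes into the two qwords with pshufb, split them across x and y, and take the element-wise min/max of the pairs.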
+ code.punpcklqdq(x, y);
+ code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.movhlps(y, x);
+ code.movq(x, x);
+ (code.*fn)(x, y);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+template<typename Function>
+static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
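+ // pshufb gathers the even-indexed bytes into the low qword and the odd-indexed bytes into the high qword of each register; shufps then collects all even elements in tmp and all odd elements in x, so one min/max yields every pairwise result.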
+ code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.pshufb(x, tmp);
+ code.pshufb(y, tmp);
+
+ code.movaps(tmp, x);
+ code.shufps(tmp, y, 0b01'00'01'00);
+
+ code.shufps(x, y, 0b11'10'11'10);
+
+ (code.*fn)(x, tmp);
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+template<typename Function>
+static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+
+ code.pshufhw(x, x, 0b11'01'10'00);
+ code.pshufhw(y, y, 0b11'01'10'00);
+
+ // move pairs of even/odd-indexed elements into one register each
+
+ // tmp = x[0, 2], x[4, 6], y[0, 2], y[4, 6]
+ code.movaps(tmp, x);
+ code.shufps(tmp, y, 0b10'00'10'00);
+ // x = x[1, 3], x[5, 7], y[1, 3], y[5, 7]
+ code.shufps(x, y, 0b11'01'11'01);
+
+ (code.*fn)(x, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+template<typename Function>
+static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+
+ // move pairs of even/odd-indexed elements into one register each
+
+ // tmp = x[0, 2], y[0, 2], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[1, 3], y[1, 3], 0s...
+ code.insertps(x, y, 0b00011100);
+
+ (code.*fn)(x, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+
+ (code.*fn)(x, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
+ PairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
+}
+
+void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp, x);
+ code.shufps(tmp, y, 0b10001000);
+ code.shufps(x, y, 0b11011101);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmaxsd(x, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ } else {
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp2, tmp);
+ code.pcmpgtd(tmp2, x);
+ code.pand(tmp, tmp2);
+ code.pandn(tmp2, x);
+ code.por(tmp2, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, tmp2);
+ }
+}
+
+void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ PairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
+ PairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp1, x);
+ code.shufps(tmp1, y, 0b10001000);
+ code.shufps(x, y, 0b11011101);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmaxud(x, tmp1);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ } else {
+ const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp2, x);
+
+ code.pxor(tmp2, tmp3);
+ code.pxor(tmp3, tmp1);
+ code.pcmpgtd(tmp3, tmp2);
+ code.pand(tmp1, tmp3);
+ code.pandn(tmp3, x);
+ code.por(tmp1, tmp3);
+
+ ctx.reg_alloc.DefineValue(inst, tmp1);
+ }
+}
+
+void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
+ PairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
+}
+
+void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp, x);
+ code.shufps(tmp, y, 0b10001000);
+ code.shufps(x, y, 0b11011101);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pminsd(x, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ } else {
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(tmp2, x);
+ code.pcmpgtd(tmp2, tmp);
+ code.pand(tmp, tmp2);
+ code.pandn(tmp2, x);
+ code.por(tmp2, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, tmp2);
+ }
+}
+
+void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ PairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
+ PairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp1, x);
+ code.shufps(tmp1, y, 0b10001000);
+ code.shufps(x, y, 0b11011101);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pminud(x, tmp1);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+ } else {
+ const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp2, tmp1);
+
+ code.pxor(tmp2, tmp3);
+ code.pxor(tmp3, x);
+ code.pcmpgtd(tmp3, tmp2);
+ code.pand(tmp1, tmp3);
+ code.pandn(tmp3, x);
+ code.por(tmp1, tmp3);
+
+ ctx.reg_alloc.DefineValue(inst, tmp1);
+ }
+}
+
+void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
+ LowerPairedMax(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
+ LowerPairedMin(result, a, b);
+ });
+}
+
+template<typename D, typename T>
+static D PolynomialMultiply(T lhs, T rhs) {
+ constexpr size_t bit_size = mcl::bitsizeof<T>;
+ const std::bitset<bit_size> operand(lhs);
+
+ D res = 0;
+ for (size_t i = 0; i < bit_size; i++) {
+ if (operand[i]) {
+ res ^= rhs << i;
+ }
+ }
+
+ return res;
+}
+
+void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ Xbyak::Label loop;
+
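+ // Carry-less multiply by shift-and-XOR: each pass tests one bit of b per byte (the mask doubles every pass),
+ // and pblendvb commits the precomputed result ^ a only where that bit was set; a doubles to form the next shifted partial product.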
+ code.pxor(result, result);
+ code.movdqa(mask, code.Const(xword, 0x0101010101010101, 0x0101010101010101));
+ code.mov(counter, 8);
+
+ code.L(loop);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpand(xmm0, xmm_b, mask);
+ code.vpxor(alternate, result, xmm_a);
+ } else {
+ code.movdqa(xmm0, xmm_b);
+ code.movdqa(alternate, result);
+ code.pand(xmm0, mask);
+ code.pxor(alternate, xmm_a);
+ }
+ code.pcmpeqb(xmm0, mask);
+ code.paddb(mask, mask);
+ code.paddb(xmm_a, xmm_a);
+ code.pblendvb(result, alternate);
+ code.dec(counter);
+ code.jnz(loop);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ std::transform(a.begin(), a.end(), b.begin(), result.begin(), PolynomialMultiply<u8, u8>);
+ });
+}
+
+void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ Xbyak::Label loop;
+
+ code.pmovzxbw(xmm_a, xmm_a);
+ code.pmovzxbw(xmm_b, xmm_b);
+ code.pxor(result, result);
+ code.movdqa(mask, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
+ code.mov(counter, 8);
+
+ code.L(loop);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpand(xmm0, xmm_b, mask);
+ code.vpxor(alternate, result, xmm_a);
+ } else {
+ code.movdqa(xmm0, xmm_b);
+ code.movdqa(alternate, result);
+ code.pand(xmm0, mask);
+ code.pxor(alternate, xmm_a);
+ }
+ code.pcmpeqw(xmm0, mask);
+ code.paddw(mask, mask);
+ code.paddw(xmm_a, xmm_a);
+ code.pblendvb(result, alternate);
+ code.dec(counter);
+ code.jnz(loop);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = PolynomialMultiply<u16, u8>(a[i], b[i]);
+ }
+ });
+}
+
+void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::PCLMULQDQ)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.pclmulqdq(xmm_a, xmm_b, 0x00);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ return;
+ }
+
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
+ const auto handle_high_bits = [](u64 lhs, u64 rhs) {
+ constexpr size_t bit_size = mcl::bitsizeof<u64>;
+ u64 result = 0;
+
+ for (size_t i = 1; i < bit_size; i++) {
+ if (mcl::bit::get_bit(i, lhs)) {
+ result ^= rhs >> (bit_size - i);
+ }
+ }
+
+ return result;
+ };
+
+ result[0] = PolynomialMultiply<u64, u64>(a[0], b[0]);
+ result[1] = handle_high_bits(a[0], b[0]);
+ });
+}
+
+void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.vpopcntb(data, data);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
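+ // Nibble-LUT popcount: split each byte into its low and high nibble, look both up in a 16-entry popcount table with pshufb, and add the two partial counts.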
+ code.movdqa(high_a, low_a);
+ code.psrlw(high_a, 4);
+ code.movdqa(tmp1, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F));
+ code.pand(high_a, tmp1); // High nibbles
+ code.pand(low_a, tmp1); // Low nibbles
+
+ code.movdqa(tmp1, code.Const(xword, 0x0302020102010100, 0x0403030203020201));
+ code.movdqa(tmp2, tmp1);
+ code.pshufb(tmp1, low_a);
+ code.pshufb(tmp2, high_a);
+
+ code.paddb(tmp1, tmp2);
+
+ ctx.reg_alloc.DefineValue(inst, tmp1);
+ return;
+ }
+
+ EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a) {
+ std::transform(a.begin(), a.end(), result.begin(), [](u8 val) {
+ return static_cast<u8>(mcl::bit::count_ones(val));
+ });
+ });
+}
+
+void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::GFNI)) {
+ code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0);
+ } else {
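+ // Split each byte into its nibbles first; with SSSE3 the reversal is two pshufb table lookups, otherwise swap the nibbles, then the 2-bit pairs, then the adjacent bits.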
+ const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
+ code.pand(high_nibble_reg, data);
+ code.pxor(data, high_nibble_reg);
+ code.psrld(high_nibble_reg, 4);
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ // High lookup
+ const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010));
+ code.pshufb(high_reversed_reg, data);
+
+ // Low lookup (low nibble equivalent of the above)
+ code.movdqa(data, code.Const(xword, 0x0E060A020C040800, 0x0F070B030D050901));
+ code.pshufb(data, high_nibble_reg);
+ code.por(data, high_reversed_reg);
+ } else {
+ code.pslld(data, 4);
+ code.por(data, high_nibble_reg);
+
+ code.movdqa(high_nibble_reg, code.Const(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC));
+ code.pand(high_nibble_reg, data);
+ code.pxor(data, high_nibble_reg);
+ code.psrld(high_nibble_reg, 2);
+ code.pslld(data, 2);
+ code.por(data, high_nibble_reg);
+
+ code.movdqa(high_nibble_reg, code.Const(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA));
+ code.pand(high_nibble_reg, data);
+ code.pxor(data, high_nibble_reg);
+ code.psrld(high_nibble_reg, 1);
+ code.paddd(data, data);
+ code.por(data, high_nibble_reg);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(tmp, data);
+ code.psllw(tmp, 8);
+ code.psrlw(data, 8);
+ code.por(data, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // TODO: PSHUFB
+
+ code.movdqa(tmp, data);
+ code.psllw(tmp, 8);
+ code.psrlw(data, 8);
+ code.por(data, tmp);
+ code.pshuflw(data, data, 0b10110001);
+ code.pshufhw(data, data, 0b10110001);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.pshuflw(data, data, 0b10110001);
+ code.pshufhw(data, data, 0b10110001);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // TODO: PSHUFB
+
+ code.movdqa(tmp, data);
+ code.psllw(tmp, 8);
+ code.psrlw(data, 8);
+ code.por(data, tmp);
+ code.pshuflw(data, data, 0b00011011);
+ code.pshufhw(data, data, 0b00011011);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.pshuflw(data, data, 0b00011011);
+ code.pshufhw(data, data, 0b00011011);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.pshuflw(data, data, 0b01001110);
+ code.pshufhw(data, data, 0b01001110);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm temp = xmm0;
+
+ // Add upper elements to lower elements
+ code.pshufd(temp, data, 0b01'00'11'10);
+ code.paddb(data, temp);
+
+ // Add adjacent 8-bit values into 64-bit lanes
+ code.pxor(temp, temp);
+ code.psadbw(data, temp);
+
+ // Zero-extend lower 8-bits
+ code.pslldq(data, 15);
+ code.psrldq(data, 15);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm temp = xmm0;
+
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pxor(temp, temp);
+ code.phaddw(data, xmm0);
+ code.phaddw(data, xmm0);
+ code.phaddw(data, xmm0);
+ } else {
+ // Add upper elements to lower elements
+ code.pshufd(temp, data, 0b00'01'10'11);
+ code.paddw(data, temp);
+
+ // Add pairs of 16-bit values into 32-bit lanes
+ code.movdqa(temp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
+ code.pmaddwd(data, temp);
+
+ // Sum adjacent 32-bit lanes
+ code.pshufd(temp, data, 0b10'11'00'01);
+ code.paddd(data, temp);
+ // Zero-extend lower 16-bits
+ code.pslldq(data, 14);
+ code.psrldq(data, 14);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm temp = xmm0;
+
+ // Add upper elements to lower elements (reversed)
+ code.pshufd(temp, data, 0b00'01'10'11);
+ code.paddd(data, temp);
+
+ // Sum adjacent 32-bit lanes
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.phaddd(data, data);
+ } else {
+ code.pshufd(temp, data, 0b10'11'00'01);
+ code.paddd(data, temp);
+ }
+
+ // Shift the uppermost lane (which now holds the total) down into the lowest lane, zeroing the rest
+ code.psrldq(data, 12);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm temp = xmm0;
+
+ // Add upper elements to lower elements
+ code.pshufd(temp, data, 0b01'00'11'10);
+ code.paddq(data, temp);
+
+ // Zero-extend lower 64-bits
+ code.movq(data, data);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const u8 shift_amount = args[1].GetImmediateU8();
+ ASSERT(shift_amount % 32 == 0);
+ const u8 shuffle_imm = std::rotr<u8>(0b11100100, shift_amount / 32 * 2);
+
+ code.pshufd(result, operand, shuffle_imm);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
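+ // pavgb/pavgw compute the unsigned rounding average (a + b + 1) >> 1, so the 8/16-bit cases bias the inputs into the unsigned range and bias the result back.
+ // There is no pavgd, so the 32-bit case is computed as (a >> 1) + (b >> 1) + ((a | b) & 1) using arithmetic shifts.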
+ switch (esize) {
+ case 8: {
+ const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
+
+ code.paddb(a, vec_128);
+ code.paddb(b, vec_128);
+ code.pavgb(a, b);
+ code.paddb(a, vec_128);
+ break;
+ }
+ case 16: {
+ const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
+
+ code.paddw(a, vec_32768);
+ code.paddw(b, vec_32768);
+ code.pavgw(a, b);
+ code.paddw(a, vec_32768);
+ break;
+ }
+ case 32: {
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+ code.movdqa(tmp1, a);
+
+ code.por(a, b);
+ code.psrad(tmp1, 1);
+ code.psrad(b, 1);
+ code.pslld(a, 31);
+ code.paddd(b, tmp1);
+ code.psrld(a, 31);
+ code.paddd(a, b);
+ break;
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddSigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddSigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddSigned(32, ctx, inst, code);
+}
+
+static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ switch (esize) {
+ case 8:
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgb);
+ return;
+ case 16:
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgw);
+ return;
+ case 32: {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
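+ // Same (a >> 1) + (b >> 1) + ((a | b) & 1) identity as the signed 32-bit case above, but with logical shifts since the operands are unsigned.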
+ code.movdqa(tmp1, a);
+
+ code.por(a, b);
+ code.psrld(tmp1, 1);
+ code.psrld(b, 1);
+ code.pslld(a, 31);
+ code.paddd(b, tmp1);
+ code.psrld(a, 31);
+ code.paddd(a, b);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+ break;
+ }
+ }
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddUnsigned(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddUnsigned(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorRoundingHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorRoundingHalvingAddUnsigned(32, ctx, inst, code);
+}
+
+template<typename T, typename U>
+static void RoundingShiftLeft(VectorArray<T>& out, const VectorArray<T>& lhs, const VectorArray<U>& rhs) {
+ using signed_type = std::make_signed_t<T>;
+ using unsigned_type = std::make_unsigned_t<T>;
+
+ constexpr auto bit_size = static_cast<s64>(mcl::bitsizeof<T>);
+
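+ // The shift amount is the sign-extended low byte of rhs: non-negative amounts shift left (becoming zero once the element width is reached), negative amounts shift right with rounding by adding back the last bit shifted out.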
+ for (size_t i = 0; i < out.size(); i++) {
+ const s64 extended_shift = static_cast<s64>(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF));
+
+ if (extended_shift >= 0) {
+ if (extended_shift >= bit_size) {
+ out[i] = 0;
+ } else {
+ out[i] = static_cast<T>(static_cast<unsigned_type>(lhs[i]) << extended_shift);
+ }
+ } else {
+ if ((std::is_unsigned_v<T> && extended_shift < -bit_size) || (std::is_signed_v<T> && extended_shift <= -bit_size)) {
+ out[i] = 0;
+ } else {
+ const s64 shift_value = -extended_shift - 1;
+ const T shifted = (lhs[i] & (static_cast<signed_type>(1) << shift_value)) >> shift_value;
+
+ if (extended_shift == -bit_size) {
+ out[i] = shifted;
+ } else {
+ out[i] = (lhs[i] >> -extended_shift) + shifted;
+ }
+ }
+ }
+ }
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftS8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& lhs, const VectorArray<s8>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftS16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& lhs, const VectorArray<s16>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftS32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& lhs, const VectorArray<s32>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftS64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& lhs, const VectorArray<s64>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftU8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& lhs, const VectorArray<s8>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftU16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& lhs, const VectorArray<s16>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftU32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) {
+ RoundingShiftLeft(result, lhs, rhs);
+ });
+}
+
+void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.pmovsxbw(a, a);
+ ctx.reg_alloc.DefineValue(inst, a);
+ } else {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.pxor(result, result);
+ code.punpcklbw(result, a);
+ code.psraw(result, 8);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.pmovsxwd(a, a);
+ ctx.reg_alloc.DefineValue(inst, a);
+ } else {
+ const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.pxor(result, result);
+ code.punpcklwd(result, a);
+ code.psrad(result, 16);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+}
+
+void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmovsxdq(a, a);
+ } else {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(tmp, a);
+ code.psrad(tmp, 31);
+ code.punpckldq(a, tmp);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr();
+
+ code.movq(gpr_tmp, data);
+ code.sar(gpr_tmp, 63);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pinsrq(data, gpr_tmp, 1);
+ } else {
+ const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
+
+ code.movq(xmm_tmp, gpr_tmp);
+ code.punpcklqdq(data, xmm_tmp);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ // Only the signed 16-bit min/max instructions (pminsw/pmaxsw) are available below SSE4.1
+ if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) {
+ code.movdqa(tmp, x);
+
+ switch (esize) {
+ case 8:
+ code.pminsb(tmp, y);
+ code.pmaxsb(x, y);
+ code.psubb(x, tmp);
+ break;
+ case 16:
+ code.pminsw(tmp, y);
+ code.pmaxsw(x, y);
+ code.psubw(x, tmp);
+ break;
+ case 32:
+ code.pminsd(tmp, y);
+ code.pmaxsd(x, y);
+ code.psubd(x, tmp);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ } else {
+ code.movdqa(tmp, y);
+
+ switch (esize) {
+ case 8:
+ code.pcmpgtb(tmp, x);
+ code.psubb(x, y);
+ code.pxor(x, tmp);
+ code.psubb(x, tmp);
+ break;
+ case 32:
+ code.pcmpgtd(tmp, x);
+ code.psubd(x, y);
+ code.pxor(x, tmp);
+ code.psubd(x, tmp);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitVectorSignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedAbsoluteDifference(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorSignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedAbsoluteDifference(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedAbsoluteDifference(32, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
+ const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+ const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (upper_inst) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmulhw(result, x, y);
+ } else {
+ code.movdqa(result, x);
+ code.pmulhw(result, y);
+ }
+
+ ctx.reg_alloc.DefineValue(upper_inst, result);
+ }
+
+ if (lower_inst) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmullw(result, x, y);
+ } else {
+ code.movdqa(result, x);
+ code.pmullw(result, y);
+ }
+ ctx.reg_alloc.DefineValue(lower_inst, result);
+ }
+}
+
+void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
+ const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+ const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmulld(result, x, y);
+
+ ctx.reg_alloc.DefineValue(lower_inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ if (lower_inst) {
+ const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+ code.vpmulld(lower_result, x, y);
+ ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+ }
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmuldq(result, x, y);
+ code.vpsrlq(x, x, 32);
+ code.vpsrlq(y, y, 32);
+ code.vpmuldq(x, x, y);
+ code.shufps(result, x, 0b11011101);
+
+ ctx.reg_alloc.DefineValue(upper_inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
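+ // pmuludq only performs unsigned 32x32->64 multiplies, so a sign correction is computed and subtracted from the unsigned upper halves to recover the signed result.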
+ // calculate sign correction
+ code.movdqa(tmp, x);
+ code.movdqa(sign_correction, y);
+ code.psrad(tmp, 31);
+ code.psrad(sign_correction, 31);
+ code.pand(tmp, y);
+ code.pand(sign_correction, x);
+ code.paddd(sign_correction, tmp);
+ code.pand(sign_correction, code.Const(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF));
+
+ // calculate unsigned multiply
+ code.movdqa(tmp, x);
+ code.pmuludq(tmp, y);
+ code.psrlq(x, 32);
+ code.psrlq(y, 32);
+ code.pmuludq(x, y);
+
+ // put everything into place
+ code.pcmpeqw(upper_result, upper_result);
+ code.pcmpeqw(lower_result, lower_result);
+ code.psllq(upper_result, 32);
+ code.psrlq(lower_result, 32);
+ code.pand(upper_result, x);
+ code.pand(lower_result, tmp);
+ code.psrlq(tmp, 32);
+ code.psllq(x, 32);
+ code.por(upper_result, tmp);
+ code.por(lower_result, x);
+ code.psubd(upper_result, sign_correction);
+
+ if (upper_inst) {
+ ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+ }
+ if (lower_inst) {
+ ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+ }
+}
+
+static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm data_test = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Address mask = [esize, &code] {
+ switch (esize) {
+ case 8:
+ return code.Const(xword, 0x8080808080808080, 0x8080808080808080);
+ case 16:
+ return code.Const(xword, 0x8000800080008000, 0x8000800080008000);
+ case 32:
+ return code.Const(xword, 0x8000000080000000, 0x8000000080000000);
+ case 64:
+ return code.Const(xword, 0x8000000000000000, 0x8000000000000000);
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) {
+ switch (esize) {
+ case 8:
+ code.pcmpeqb(x, y);
+ break;
+ case 16:
+ code.pcmpeqw(x, y);
+ break;
+ case 32:
+ code.pcmpeqd(x, y);
+ break;
+ case 64:
+ code.pcmpeqq(x, y);
+ break;
+ }
+ };
+
+ // Keep a copy of the initial data for determining whether or not
+ // to set the Q flag
+ code.movdqa(data_test, data);
+
+ switch (esize) {
+ case 8:
+ VectorAbs8(code, ctx, data);
+ break;
+ case 16:
+ VectorAbs16(code, ctx, data);
+ break;
+ case 32:
+ VectorAbs32(code, ctx, data);
+ break;
+ case 64:
+ VectorAbs64(code, ctx, data);
+ break;
+ }
+
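+ // abs(INT_MIN) wraps back to INT_MIN, so lanes still equal to the sign mask after the abs are the saturating ones; XORing them with the comparison mask turns INT_MIN into INT_MAX.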
+ code.movdqa(sign, mask);
+ vector_equality(sign, data);
+ code.pxor(data, sign);
+
+ // Check if the initial data contained any elements equal to the minimum signed value
+ // (0x80, 0x8000, 0x80000000 or 0x8000000000000000, depending on esize).
+ // If any exist, then the Q flag needs to be set.
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.movdqa(sign, mask);
+ vector_equality(data_test, sign);
+ code.pmovmskb(bit, data_test);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAbs(8, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAbs(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAbs(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorSignedSaturatedAbs(64, code, ctx, inst);
+ return;
+ }
+
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
+ bool qc_flag = false;
+
+ for (size_t i = 0; i < result.size(); i++) {
+ if (static_cast<u64>(data[i]) == 0x8000000000000000) {
+ result[i] = 0x7FFFFFFFFFFFFFFF;
+ qc_flag = true;
+ } else {
+ result[i] = std::abs(data[i]);
+ }
+ }
+
+ return qc_flag;
+ });
+}
+
+template<size_t bit_width>
+static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ code.movdqa(xmm0, y);
+ ctx.reg_alloc.Release(y);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ switch (bit_width) {
+ case 8:
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddb(result, x, xmm0);
+ } else {
+ code.movdqa(result, x);
+ code.paddb(result, xmm0);
+ }
+ break;
+ case 16:
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddw(result, x, xmm0);
+ } else {
+ code.movdqa(result, x);
+ code.paddw(result, xmm0);
+ }
+ break;
+ case 32:
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddd(result, x, xmm0);
+ } else {
+ code.movdqa(result, x);
+ code.paddd(result, xmm0);
+ }
+ break;
+ case 64:
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddq(result, x, xmm0);
+ } else {
+ code.movdqa(result, x);
+ code.paddq(result, xmm0);
+ }
+ break;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ // xmm0 = majority(~y, x, res)
+ code.vpternlogd(xmm0, x, result, 0b10001110);
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpor(tmp, x, result);
+ code.pand(x, result);
+ code.vpblendvb(xmm0, tmp, x, xmm0);
+ } else {
+ code.movdqa(tmp, x);
+ code.pxor(x, result);
+ code.pand(tmp, result);
+ code.pandn(xmm0, x);
+ code.por(xmm0, tmp);
+ }
+
+ ctx.reg_alloc.Release(x);
+
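+    // From here on only the sign bit of each element of xmm0 is used: it flags lanes that must
+    // saturate to the signed maximum and feeds FPSR.QC. Expand it into a full per-element mask.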
+ switch (bit_width) {
+ case 8:
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+ code.pcmpeqb(tmp2, tmp2);
+ code.pxor(tmp, tmp);
+ code.vpblendvb(xmm0, tmp, tmp2, xmm0);
+ ctx.reg_alloc.Release(tmp2);
+ } else {
+ code.pand(xmm0, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
+ code.movdqa(tmp, xmm0);
+ code.psrlw(tmp, 7);
+ code.pxor(xmm0, xmm0);
+ code.psubb(xmm0, tmp);
+ }
+ break;
+ case 16:
+ code.psraw(xmm0, 15);
+ break;
+ case 32:
+ code.psrad(xmm0, 31);
+ break;
+ case 64:
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpsraq(xmm0, xmm0, 63);
+ } else {
+ code.psrad(xmm0, 31);
+ code.pshufd(xmm0, xmm0, 0b11110101);
+ }
+ break;
+ }
+
+ code.movdqa(tmp, xmm0);
+ switch (bit_width) {
+ case 8:
+ code.paddb(tmp, tmp);
+ code.psrlw(tmp, 1);
+ break;
+ case 16:
+ code.psrlw(tmp, 1);
+ break;
+ case 32:
+ code.psrld(tmp, 1);
+ break;
+ case 64:
+ code.psrlq(tmp, 1);
+ break;
+ }
+
+ const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pmovmskb(mask, xmm0);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pblendvb(result, tmp);
+ } else {
+ code.pandn(xmm0, result);
+ code.por(xmm0, tmp);
+ code.movdqa(result, xmm0);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAccumulateUnsigned<8>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAccumulateUnsigned<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAccumulateUnsigned<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedAccumulateUnsigned<64>(code, ctx, inst);
+}
+
+template<bool is_rounding>
+static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmulhw(upper_tmp, x, y);
+ } else {
+ code.movdqa(upper_tmp, x);
+ code.pmulhw(upper_tmp, y);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmullw(lower_tmp, x, y);
+ } else {
+ code.movdqa(lower_tmp, x);
+ code.pmullw(lower_tmp, y);
+ }
+
+ ctx.reg_alloc.Release(x);
+ ctx.reg_alloc.Release(y);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
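+    // result = (2 * x * y) >> 16, with optional rounding. The only input that can saturate is
+    // 0x8000 * 0x8000, whose doubled high half is 0x8000; those lanes are flipped to 0x7FFF and
+    // recorded in FPSR.QC below.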
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ if constexpr (is_rounding) {
+ code.vpsrlw(lower_tmp, lower_tmp, 14);
+ code.vpaddw(lower_tmp, lower_tmp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
+ code.vpsrlw(lower_tmp, lower_tmp, 1);
+ } else {
+ code.vpsrlw(lower_tmp, lower_tmp, 15);
+ }
+ code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
+ code.vpaddw(result, upper_tmp, lower_tmp);
+ code.vpcmpeqw(upper_tmp, result, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
+ code.vpxor(result, result, upper_tmp);
+ } else {
+ code.paddw(upper_tmp, upper_tmp);
+ if constexpr (is_rounding) {
+ code.psrlw(lower_tmp, 14);
+ code.paddw(lower_tmp, code.Const(xword, 0x0001000100010001, 0x0001000100010001));
+ code.psrlw(lower_tmp, 1);
+ } else {
+ code.psrlw(lower_tmp, 15);
+ }
+ code.movdqa(result, upper_tmp);
+ code.paddw(result, lower_tmp);
+ code.movdqa(upper_tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
+ code.pcmpeqw(upper_tmp, result);
+ code.pxor(result, upper_tmp);
+ }
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pmovmskb(bit, upper_tmp);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHigh16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedDoublingMultiply16<false>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedDoublingMultiply16<true>(code, ctx, inst);
+}
+
+template<bool is_rounding>
+void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmuldq(odds, x, y);
+ code.vpsrlq(x, x, 32);
+ code.vpsrlq(y, y, 32);
+ code.vpmuldq(even, x, y);
+
+ ctx.reg_alloc.Release(x);
+ ctx.reg_alloc.Release(y);
+
+ code.vpaddq(odds, odds, odds);
+ code.vpaddq(even, even, even);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if constexpr (is_rounding) {
+ code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
+ code.vpaddq(odds, odds, result);
+ code.vpaddq(even, even, result);
+ }
+
+ code.vpsrlq(result, odds, 32);
+ code.vblendps(result, result, even, 0b1010);
+
+ const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.vpxor(result, result, mask);
+ code.pmovmskb(bit, mask);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.Release(mask);
+ ctx.reg_alloc.Release(bit);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+    // Calculate the sign correction: the upper half of a signed 32x32 multiply differs from the
+    // unsigned one by ((x < 0 ? y : 0) + (y < 0 ? x : 0)). It is doubled (pslld by 1) to match the
+    // doubled product and is subtracted at the end.
+ code.movdqa(tmp, x);
+ code.movdqa(sign_correction, y);
+ code.psrad(tmp, 31);
+ code.psrad(sign_correction, 31);
+ code.pand(tmp, y);
+ code.pand(sign_correction, x);
+ code.paddd(sign_correction, tmp);
+ code.pslld(sign_correction, 1);
+
+ // unsigned multiply
+ code.movdqa(tmp, x);
+ code.pmuludq(tmp, y);
+ code.psrlq(x, 32);
+ code.psrlq(y, 32);
+ code.pmuludq(x, y);
+
+ // double
+ code.paddq(tmp, tmp);
+ code.paddq(x, x);
+
+ if constexpr (is_rounding) {
+ code.movdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
+ code.paddq(tmp, result);
+ code.paddq(x, result);
+ }
+
+    // Gather the high 32 bits of each doubled product back into its 32-bit lane.
+ code.pcmpeqw(result, result);
+ code.psllq(result, 32);
+ code.pand(result, x);
+ code.psrlq(tmp, 32);
+ code.por(result, tmp);
+ code.psubd(result, sign_correction);
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.pcmpeqd(tmp, result);
+ code.pxor(result, tmp);
+ code.pmovmskb(bit, tmp);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHigh32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedDoublingMultiply32<false>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedDoublingMultiply32<true>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
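+    // Interleaving each element with itself lets pmaddwd compute a*b + a*b = 2*a*b per 32-bit lane,
+    // providing the doubling for free.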
+ code.punpcklwd(x, x);
+ code.punpcklwd(y, y);
+ code.pmaddwd(x, y);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpcmpeqd(y, x, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.vpxor(x, x, y);
+ } else {
+ code.movdqa(y, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.pcmpeqd(y, x);
+ code.pxor(x, y);
+ }
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pmovmskb(bit, y);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmovsxdq(x, x);
+ code.vpmovsxdq(y, y);
+ code.vpmuldq(x, x, y);
+ code.vpaddq(x, x, x);
+ } else {
+ const Xbyak::Reg64 a = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 b = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 c = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 d = ctx.reg_alloc.ScratchGpr();
+
+ code.movq(c, x);
+ code.movq(d, y);
+ code.movsxd(a, c.cvt32());
+ code.movsxd(b, d.cvt32());
+ code.sar(c, 32);
+ code.sar(d, 32);
+ code.imul(a, b);
+ code.imul(c, d);
+ code.movq(x, a);
+ code.movq(y, c);
+ code.punpcklqdq(x, y);
+ code.paddq(x, x);
+
+ ctx.reg_alloc.Release(a);
+ ctx.reg_alloc.Release(b);
+ ctx.reg_alloc.Release(c);
+ ctx.reg_alloc.Release(d);
+ }
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpcmpeqq(y, x, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
+ code.vpxor(x, x, y);
+ code.vpmovmskb(bit, y);
+ } else {
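+        // pcmpeqq is unavailable here: compare dwords instead and broadcast the upper-dword result
+        // across each 64-bit lane. Only INT32_MIN * INT32_MIN produces a doubled product whose
+        // upper dword is 0x80000000, so checking the upper half alone is enough.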
+ code.movdqa(y, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
+ code.pcmpeqd(y, x);
+ code.shufps(y, y, 0b11110101);
+ code.pxor(x, y);
+ code.pmovmskb(bit, y);
+ }
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, x);
+}
+
+static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm();
+
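+    // Narrow with signed saturation, then widen the packed result back and compare it against the
+    // source: any lane that does not round-trip was saturated and sets its bit in FPSR.QC.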
+ code.movdqa(dest, src);
+ code.pxor(xmm0, xmm0);
+
+ switch (original_esize) {
+ case 16:
+ code.packsswb(dest, xmm0);
+ code.movdqa(sign, src);
+ code.psraw(sign, 15);
+ code.packsswb(sign, sign);
+ code.movdqa(reconstructed, dest);
+ code.punpcklbw(reconstructed, sign);
+ break;
+ case 32:
+ code.packssdw(dest, xmm0);
+ code.movdqa(reconstructed, dest);
+ code.movdqa(sign, dest);
+ code.psraw(sign, 15);
+ code.punpcklwd(reconstructed, sign);
+ break;
+ default:
+ UNREACHABLE();
+ }
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pcmpeqd(reconstructed, src);
+ code.movmskps(bit, reconstructed);
+ code.xor_(bit, 0b1111);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNarrowToSigned(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNarrowToSigned(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s64>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const s64 saturated = std::clamp<s64>(a[i], -s64(0x80000000), s64(0x7FFFFFFF));
+ result[i] = static_cast<s32>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm();
+
+ code.movdqa(dest, src);
+ code.pxor(xmm0, xmm0);
+
+ switch (original_esize) {
+ case 16:
+ code.packuswb(dest, xmm0);
+ code.movdqa(reconstructed, dest);
+ code.punpcklbw(reconstructed, xmm0);
+ break;
+ case 32:
+ ASSERT(code.HasHostFeature(HostFeature::SSE41));
+ code.packusdw(dest, xmm0); // SSE4.1
+ code.movdqa(reconstructed, dest);
+ code.punpcklwd(reconstructed, xmm0);
+ break;
+ default:
+ UNREACHABLE();
+ }
+
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pcmpeqd(reconstructed, src);
+ code.movmskps(bit, reconstructed);
+ code.xor_(bit, 0b1111);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, dest);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNarrowToUnsigned(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst);
+ return;
+ }
+
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<s32>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const s32 saturated = std::clamp<s32>(a[i], 0, 0xFFFF);
+ result[i] = static_cast<u16>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<s64>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const s64 saturated = std::clamp<s64>(a[i], 0, 0xFFFFFFFF);
+ result[i] = static_cast<u32>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Address mask = [esize, &code] {
+ switch (esize) {
+ case 8:
+ return code.Const(xword, 0x8080808080808080, 0x8080808080808080);
+ case 16:
+ return code.Const(xword, 0x8000800080008000, 0x8000800080008000);
+ case 32:
+ return code.Const(xword, 0x8000000080000000, 0x8000000080000000);
+ case 64:
+ return code.Const(xword, 0x8000000000000000, 0x8000000000000000);
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+ switch (esize) {
+ case 8:
+ code.pcmpeqb(x, y);
+ break;
+ case 16:
+ code.pcmpeqw(x, y);
+ break;
+ case 32:
+ code.pcmpeqd(x, y);
+ break;
+ case 64:
+ code.pcmpeqq(x, y);
+ break;
+ }
+ };
+
+ code.movdqa(tmp, data);
+ vector_equality(tmp, mask);
+
+    // Perform the negation: 8/16-bit lanes use saturating subtracts directly, while 32/64-bit lanes
+    // are negated and then have their INT_MIN lanes flipped to INT_MAX using the mask in tmp.
+ code.pxor(zero, zero);
+ switch (esize) {
+ case 8:
+ code.psubsb(zero, data);
+ break;
+ case 16:
+ code.psubsw(zero, data);
+ break;
+ case 32:
+ code.psubd(zero, data);
+ code.pxor(zero, tmp);
+ break;
+ case 64:
+ code.psubq(zero, data);
+ code.pxor(zero, tmp);
+ break;
+ }
+
+ // Check if any elements matched the mask prior to performing saturation. If so, set the Q bit.
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.pmovmskb(bit, tmp);
+ code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+ ctx.reg_alloc.DefineValue(inst, zero);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNeg(8, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNeg(16, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturatedNeg(32, code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) {
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ EmitVectorSignedSaturatedNeg(64, code, ctx, inst);
+ return;
+ }
+
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) {
+ bool qc_flag = false;
+
+ for (size_t i = 0; i < result.size(); i++) {
+ if (static_cast<u64>(data[i]) == 0x8000000000000000) {
+ result[i] = 0x7FFFFFFFFFFFFFFF;
+ qc_flag = true;
+ } else {
+ result[i] = -data[i];
+ }
+ }
+
+ return qc_flag;
+ });
+}
+
+// MSVC requires the capture in the saturate lambda below, while clang and GCC consider it
+// unnecessary and warn about it, hence the suppressed -Wunused-lambda-capture.
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wunused-lambda-capture"
+#endif
+template<typename T, typename U = std::make_unsigned_t<T>>
+static bool VectorSignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorArray<T>& data, const VectorArray<T>& shift_values) {
+ static_assert(std::is_signed_v<T>, "T must be signed.");
+
+ bool qc_flag = false;
+
+ constexpr size_t bit_size_minus_one = mcl::bitsizeof<T> - 1;
+
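+    // saturate(v) evaluates to the signed maximum for non-negative v and to the signed minimum for
+    // negative v: (U(v) >> (bits - 1)) is 0 or 1, and adding (1 << (bits - 1)) - 1 then yields
+    // 0x7F... or 0x80... respectively.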
+ const auto saturate = [bit_size_minus_one](T value) {
+ return static_cast<T>((static_cast<U>(value) >> bit_size_minus_one) + (U{1} << bit_size_minus_one) - 1);
+ };
+
+ for (size_t i = 0; i < dst.size(); i++) {
+ const T element = data[i];
+ const T shift = std::clamp<T>(static_cast<T>(mcl::bit::sign_extend<8>(static_cast<U>(shift_values[i] & 0xFF))),
+ -static_cast<T>(bit_size_minus_one), std::numeric_limits<T>::max());
+
+ if (element == 0) {
+ dst[i] = 0;
+ } else if (shift < 0) {
+ dst[i] = static_cast<T>(element >> -shift);
+ } else if (static_cast<U>(shift) > bit_size_minus_one) {
+ dst[i] = saturate(element);
+ qc_flag = true;
+ } else {
+ const T shifted = T(U(element) << shift);
+
+ if ((shifted >> shift) != element) {
+ dst[i] = saturate(element);
+ qc_flag = true;
+ } else {
+ dst[i] = shifted;
+ }
+ }
+ }
+
+ return qc_flag;
+}
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s8>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s16>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s32>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s64>);
+}
+
+template<typename T, typename U = std::make_unsigned_t<T>>
+static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const VectorArray<T>& data, u8 shift_amount) {
+ static_assert(std::is_signed_v<T>, "T must be signed.");
+
+ bool qc_flag = false;
+ for (size_t i = 0; i < dst.size(); i++) {
+ const T element = data[i];
+ const T shift = static_cast<T>(shift_amount);
+
+ if (element == 0) {
+ dst[i] = 0;
+ } else if (element < 0) {
+ dst[i] = 0;
+ qc_flag = true;
+ } else {
+ const U shifted = static_cast<U>(element) << static_cast<U>(shift);
+ const U shifted_test = shifted >> static_cast<U>(shift);
+
+ if (shifted_test != static_cast<U>(element)) {
+ dst[i] = static_cast<T>(std::numeric_limits<U>::max());
+ qc_flag = true;
+ } else {
+ dst[i] = shifted;
+ }
+ }
+ }
+
+ return qc_flag;
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s8>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s16>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s64>);
+}
+
+void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubd);
+}
+
+void EmitX64::EmitVectorSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubq);
+}
+
+void EmitX64::EmitVectorTable(EmitContext&, IR::Inst* inst) {
+ // Do nothing. We *want* to hold on to the refcount for our arguments, so VectorTableLookup can use our arguments.
+ ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times");
+}
+
+void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
+ ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());
+
+ const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
+ const bool is_defaults_zero = inst->GetArg(0).IsZero();
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) {
+ const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(args[2]) : ctx.reg_alloc.UseScratchXmm(args[2]);
+
+ const u64 index_count = mcl::bit::replicate_element<u8, u64>(static_cast<u8>(table_size * 8));
+
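+        // k1 marks the in-range indices in the low eight lanes; the constant's upper qword is zero,
+        // so the comparison never selects the (unused) upper lanes.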
+ code.vpcmpub(k1, indicies, code.Const(xword, index_count, 0), CmpInt::LessThan);
+
+ switch (table_size) {
+ case 1: {
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+ if (is_defaults_zero) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.vpermb(result | k1 | T_z, indicies, xmm_table0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermb(result | k1, indicies, xmm_table0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ break;
+ }
+ case 2: {
+ const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
+ if (is_defaults_zero) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.vpermb(result | k1 | T_z, indicies, xmm0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermb(result | k1, indicies, xmm0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ break;
+ }
+ case 3: {
+ const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[2]);
+ code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
+ if (is_defaults_zero) {
+ code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
+ ctx.reg_alloc.DefineValue(inst, indicies);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermi2b(indicies, xmm0, xmm_table1);
+ code.vmovdqu8(result | k1, indicies);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ break;
+ }
+ case 4: {
+ const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
+ const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
+ code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
+ code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper);
+ if (is_defaults_zero) {
+ code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
+ ctx.reg_alloc.DefineValue(inst, indicies);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermi2b(indicies, xmm0, xmm_table1);
+ code.vmovdqu8(result | k1, indicies);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ break;
+ }
+ return;
+ }
+
+ const std::array<u64, 5> sat_const{
+ 0,
+ 0x7878787878787878,
+ 0x7070707070707070,
+ 0x6868686868686868,
+ 0x6060606060606060,
+ };
+
+ if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
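+        // Only the low 8 bytes of the table are valid. The upper half of `result` is zeroed so
+        // indices 8-15 read zeroes, the saturating add pushes indices >= 0x10 up to >= 0x80 (which
+        // pshufb maps to zero), and the 0xFF upper qword takes the unused upper-lane indices out of
+        // range as well.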
+ code.xorps(result, result);
+ code.movsd(result, xmm_table0);
+ code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ code.pshufb(result, indicies);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+
+ code.punpcklqdq(xmm_table0, xmm_table0_upper);
+ code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ code.pshufb(xmm_table0, indicies);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+
+ if (table_size == 2) {
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ code.punpcklqdq(xmm_table0, xmm_table0_upper);
+ ctx.reg_alloc.Release(xmm_table0_upper);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
+ }
+ code.pshufb(xmm_table0, indicies);
+ code.pblendvb(xmm_table0, defaults);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
+
+ {
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ code.punpcklqdq(xmm_table0, xmm_table0_upper);
+ ctx.reg_alloc.Release(xmm_table0_upper);
+ }
+ if (table_size == 3) {
+ code.pxor(xmm0, xmm0);
+ code.punpcklqdq(xmm_table1, xmm0);
+ } else {
+ ASSERT(table_size == 4);
+ const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
+ code.punpcklqdq(xmm_table1, xmm_table1_upper);
+ ctx.reg_alloc.Release(xmm_table1_upper);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ }
+ code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF));
+ code.pshufb(xmm_table0, xmm0);
+ code.pshufb(xmm_table1, indicies);
+ code.pblendvb(xmm_table0, xmm_table1);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]);
+
+ {
+ const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]);
+ code.punpcklqdq(xmm_table0, xmm_table0_upper);
+ ctx.reg_alloc.Release(xmm_table0_upper);
+ }
+ if (table_size == 4) {
+ const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]);
+ code.punpcklqdq(xmm_table1, xmm_table1_upper);
+ ctx.reg_alloc.Release(xmm_table1_upper);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
+ }
+ code.pshufb(xmm_table0, indicies);
+ code.pshufb(xmm_table1, indicies);
+ code.pblendvb(xmm_table0, xmm_table1);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF));
+ }
+ code.pblendvb(xmm_table0, defaults);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ const u32 stack_space = static_cast<u32>(6 * 8);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ for (size_t i = 0; i < table_size; ++i) {
+ const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]);
+ code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
+ ctx.reg_alloc.Release(table_value);
+ }
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
+ code.mov(code.ABI_PARAM4.cvt32(), table_size);
+ code.movq(qword[code.ABI_PARAM2], defaults);
+ code.movq(qword[code.ABI_PARAM3], indicies);
+
+ code.CallLambda(
+ [](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) {
+ for (size_t i = 0; i < result.size(); ++i) {
+ const size_t index = indicies[i] / table[0].size();
+ const size_t elem = indicies[i] % table[0].size();
+ if (index < table_size) {
+ result[i] = table[index][elem];
+ }
+ }
+ });
+
+ code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
+ ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst());
+
+ const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); });
+ const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+
+ code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
+ code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan);
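+        // k1: indices that select tables 0-1 (< 32); k2: indices that are in range of any table (< 64).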
+
+ // Handle vector-table 0,1
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]);
+
+ code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
+
+ ctx.reg_alloc.Release(xmm_table0);
+ ctx.reg_alloc.Release(xmm_table1);
+
+ // Handle vector-table 2,3
+ const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(table[2]);
+ const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(table[3]);
+
+ code.kandnw(k1, k1, k2);
+ code.vpermi2b(indicies | k1, xmm_table2, xmm_table3);
+
+ if (is_defaults_zero) {
+ code.vmovdqu8(indicies | k2 | T_z, indicies);
+ ctx.reg_alloc.DefineValue(inst, indicies);
+ } else {
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vmovdqu8(defaults | k2, indicies);
+ ctx.reg_alloc.DefineValue(inst, defaults);
+ }
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+
+ code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
+ code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan);
+
+ // Handle vector-table 0,1
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]);
+
+ code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
+
+ ctx.reg_alloc.Release(xmm_table0);
+ ctx.reg_alloc.Release(xmm_table1);
+
+ // Handle vector-table 2
+ const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(table[2]);
+
+ code.kandnw(k1, k1, k2);
+ code.vpermb(indicies | k1, indicies, xmm_table2);
+
+ if (is_defaults_zero) {
+ code.vmovdqu8(indicies | k2 | T_z, indicies);
+ ctx.reg_alloc.DefineValue(inst, indicies);
+ } else {
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vmovdqu8(defaults | k2, indicies);
+ ctx.reg_alloc.DefineValue(inst, defaults);
+ }
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]);
+
+ code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
+
+ if (is_defaults_zero) {
+ code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1);
+ ctx.reg_alloc.DefineValue(inst, indicies);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermi2b(indicies, xmm_table0, xmm_table1);
+ code.vmovdqu8(result | k1, indicies);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]);
+
+ code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
+
+ if (is_defaults_zero) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.vpermb(result | k1 | T_z, indicies, xmm_table0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.vpermb(result | k1, indicies, xmm_table0);
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+
+ code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
+ code.pshufb(xmm_table0, indicies);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
+ }
+ code.pshufb(xmm_table0, indicies);
+ code.pblendvb(xmm_table0, defaults);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]);
+ const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]);
+ const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]);
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
+ } else {
+ code.movaps(xmm0, indicies);
+ code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
+ }
+ code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0x6060606060606060));
+ code.pshufb(xmm_table0, xmm0);
+ code.pshufb(xmm_table1, indicies);
+ code.pblendvb(xmm_table0, xmm_table1);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_table0);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm masked = xmm16;
+
+ code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
+
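+        // The high nibble of each index is its table number (each table is 16 bytes); lanes whose
+        // index targets table i are filled from that table with a masked pshufb, while the rest
+        // keep the default value already in `result`.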
+ for (size_t i = 0; i < table_size; ++i) {
+ const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]);
+ const Xbyak::Opmask table_mask = k1;
+ const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
+
+ code.vpcmpeqb(table_mask, masked, code.Const(xword, table_index, table_index));
+
+ if (table_index == 0 && is_defaults_zero) {
+ code.vpshufb(result | table_mask | T_z, xmm_table, indicies);
+ } else {
+ code.vpshufb(result | table_mask, xmm_table, indicies);
+ }
+
+ ctx.reg_alloc.Release(xmm_table);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
+ code.pand(masked, indicies);
+
+ for (size_t i = 0; i < table_size; ++i) {
+ const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]);
+
+ const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
+
+ if (table_index == 0) {
+ code.pxor(xmm0, xmm0);
+ code.pcmpeqb(xmm0, masked);
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpcmpeqb(xmm0, masked, code.Const(xword, table_index, table_index));
+ } else {
+ code.movaps(xmm0, code.Const(xword, table_index, table_index));
+ code.pcmpeqb(xmm0, masked);
+ }
+ code.pshufb(xmm_table, indicies);
+ code.pblendvb(result, xmm_table);
+
+ ctx.reg_alloc.Release(xmm_table);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
+ ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE);
+ for (size_t i = 0; i < table_size; ++i) {
+ const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]);
+ code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
+ ctx.reg_alloc.Release(table_value);
+ }
+ const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]);
+ code.mov(code.ABI_PARAM4.cvt32(), table_size);
+ code.movaps(xword[code.ABI_PARAM2], defaults);
+ code.movaps(xword[code.ABI_PARAM3], indicies);
+
+ code.CallLambda(
+ [](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) {
+ for (size_t i = 0; i < result.size(); ++i) {
+ const size_t index = indicies[i] / table[0].size();
+ const size_t elem = indicies[i] % table[0].size();
+ if (index < table_size) {
+ result[i] = table[index][elem];
+ }
+ }
+ });
+
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]);
+ ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const bool part = args[2].GetImmediateU1();
+
+ if (!part) {
+ code.pand(lower, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.psllw(upper, 8);
+ } else {
+ code.psrlw(lower, 8);
+ code.pand(upper, code.Const(xword, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00));
+ }
+ code.por(lower, upper);
+
+ ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const bool part = args[2].GetImmediateU1();
+
+ if (!part) {
+ code.pand(lower, code.Const(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF));
+ code.pslld(upper, 16);
+ } else {
+ code.psrld(lower, 16);
+ code.pand(upper, code.Const(xword, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000));
+ }
+ code.por(lower, upper);
+
+ ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]);
+ const bool part = args[2].GetImmediateU1();
+
+ code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101);
+ code.pshufd(lower, lower, 0b11011000);
+
+ ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]);
+ const bool part = args[2].GetImmediateU1();
+
+ code.shufpd(lower, upper, !part ? 0b00 : 0b11);
+
+ ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
+
+ switch (esize) {
+ case 8: {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.movdqa(temp, x);
+ code.psubusb(temp, y);
+ code.psubusb(y, x);
+ code.por(temp, y);
+ break;
+ }
+ case 16: {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ code.movdqa(temp, x);
+ code.psubusw(temp, y);
+ code.psubusw(y, x);
+ code.por(temp, y);
+ break;
+ }
+ case 32:
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+
+ code.movdqa(temp, x);
+ code.pminud(x, y);
+ code.pmaxud(temp, y);
+ code.psubd(temp, x);
+ } else {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
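+            // There is no unsigned 32-bit min/max before SSE4.1: bias both operands by 0x80000000
+            // so that the signed compare behaves as an unsigned one, then conditionally negate the
+            // difference to obtain its absolute value.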
+ code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.pxor(x, temp);
+ code.pxor(y, temp);
+ code.movdqa(temp, x);
+ code.psubd(temp, y);
+ code.pcmpgtd(y, x);
+ code.psrld(y, 1);
+ code.pxor(temp, y);
+ code.psubd(temp, y);
+ }
+ break;
+ }
+
+ ctx.reg_alloc.DefineValue(inst, temp);
+}
+
+void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code);
+}
+
+void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
+ const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+ const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+
+ if (upper_inst) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmulhuw(result, x, y);
+ } else {
+ code.movdqa(result, x);
+ code.pmulhuw(result, y);
+ }
+
+ ctx.reg_alloc.DefineValue(upper_inst, result);
+ }
+
+ if (lower_inst) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpmullw(result, x, y);
+ } else {
+ code.movdqa(result, x);
+ code.pmullw(result, y);
+ }
+ ctx.reg_alloc.DefineValue(lower_inst, result);
+ }
+}
+
+void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
+ const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+ const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmulld(result, x, y);
+
+ ctx.reg_alloc.DefineValue(lower_inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ if (lower_inst) {
+ const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+ code.vpmulld(lower_result, x, y);
+ ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+ }
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.vpmuludq(result, x, y);
+ code.vpsrlq(x, x, 32);
+ code.vpsrlq(y, y, 32);
+ code.vpmuludq(x, x, y);
+ code.shufps(result, x, 0b11011101);
+
+ ctx.reg_alloc.DefineValue(upper_inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
+ // calculate unsigned multiply
+ code.movdqa(tmp, x);
+ code.pmuludq(tmp, y);
+ code.psrlq(x, 32);
+ code.psrlq(y, 32);
+ code.pmuludq(x, y);
+
+    // Put the products back into place: the high 32 bits of each product form upper_result and the
+    // low 32 bits form lower_result.
+ code.pcmpeqw(upper_result, upper_result);
+ code.pcmpeqw(lower_result, lower_result);
+ code.psllq(upper_result, 32);
+ code.psrlq(lower_result, 32);
+ code.pand(upper_result, x);
+ code.pand(lower_result, tmp);
+ code.psrlq(tmp, 32);
+ code.psllq(x, 32);
+ code.por(upper_result, tmp);
+ code.por(lower_result, x);
+
+ if (upper_inst) {
+ ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+ }
+ if (lower_inst) {
+ ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+ }
+}
+
+void EmitX64::EmitVectorUnsignedRecipEstimate(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a) {
+ for (size_t i = 0; i < result.size(); i++) {
+ if ((a[i] & 0x80000000) == 0) {
+ result[i] = 0xFFFFFFFF;
+ continue;
+ }
+
+ const u32 input = mcl::bit::get_bits<23, 31>(a[i]);
+ const u32 estimate = Common::RecipEstimate(input);
+
+ result[i] = (0b100000000 | estimate) << 23;
+ }
+ });
+}
+
+void EmitX64::EmitVectorUnsignedRecipSqrtEstimate(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a) {
+ for (size_t i = 0; i < result.size(); i++) {
+ if ((a[i] & 0xC0000000) == 0) {
+ result[i] = 0xFFFFFFFF;
+ continue;
+ }
+
+ const u32 input = mcl::bit::get_bits<23, 31>(a[i]);
+ const u32 estimate = Common::RecipSqrtEstimate(input);
+
+ result[i] = (0b100000000 | estimate) << 23;
+ }
+ });
+}
+
+// Simple generic case for 8, 16, and 32-bit values. 64-bit values
+// will need to be special-cased as we can't simply use a larger integral size.
+template<typename T, typename U = std::make_unsigned_t<T>>
+static bool EmitVectorUnsignedSaturatedAccumulateSigned(VectorArray<U>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
+ static_assert(std::is_signed_v<T>, "T must be signed.");
+ static_assert(mcl::bitsizeof<T> < 64, "T must be less than 64 bits in size.");
+
+ bool qc_flag = false;
+
+ for (size_t i = 0; i < result.size(); i++) {
+ // We treat rhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
+ // We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
+ const s64 x = s64{lhs[i]};
+ const s64 y = static_cast<s64>(static_cast<std::make_unsigned_t<U>>(rhs[i]));
+ const s64 sum = x + y;
+
+ if (sum > std::numeric_limits<U>::max()) {
+ result[i] = std::numeric_limits<U>::max();
+ qc_flag = true;
+ } else if (sum < 0) {
+ result[i] = std::numeric_limits<U>::min();
+ qc_flag = true;
+ } else {
+ result[i] = static_cast<U>(sum);
+ }
+ }
+
+ return qc_flag;
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s8>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s16>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s32>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) {
+ bool qc_flag = false;
+
+ for (size_t i = 0; i < result.size(); i++) {
+ const u64 x = lhs[i];
+ const u64 y = rhs[i];
+ const u64 res = x + y;
+
+ // Check sign bits to determine if an overflow occurred.
+ if ((~x & y & ~res) & 0x8000000000000000) {
+ result[i] = UINT64_MAX;
+ qc_flag = true;
+ } else if ((x & ~y & res) & 0x8000000000000000) {
+ result[i] = 0;
+ qc_flag = true;
+ } else {
+ result[i] = res;
+ }
+ }
+
+ return qc_flag;
+ });
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedNarrow16(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u16>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const u16 saturated = std::clamp<u16>(a[i], 0, 0xFF);
+ result[i] = static_cast<u8>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedNarrow32(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u32>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const u32 saturated = std::clamp<u32>(a[i], 0, 0xFFFF);
+ result[i] = static_cast<u16>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedNarrow64(EmitContext& ctx, IR::Inst* inst) {
+ EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u64>& a) {
+ result = {};
+ bool qc_flag = false;
+ for (size_t i = 0; i < a.size(); ++i) {
+ const u64 saturated = std::clamp<u64>(a[i], 0, 0xFFFFFFFF);
+ result[i] = static_cast<u32>(saturated);
+ qc_flag |= saturated != a[i];
+ }
+ return qc_flag;
+ });
+}
+
+template<typename T, typename S = std::make_signed_t<T>>
+static bool VectorUnsignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorArray<T>& data, const VectorArray<T>& shift_values) {
+ static_assert(std::is_unsigned_v<T>, "T must be an unsigned type.");
+
+ bool qc_flag = false;
+
+ constexpr size_t bit_size = mcl::bitsizeof<T>;
+ constexpr S negative_bit_size = -static_cast<S>(bit_size);
+
+ for (size_t i = 0; i < dst.size(); i++) {
+ const T element = data[i];
+ const S shift = std::clamp(static_cast<S>(mcl::bit::sign_extend<8>(static_cast<T>(shift_values[i] & 0xFF))),
+ negative_bit_size, std::numeric_limits<S>::max());
+
+ if (element == 0 || shift <= negative_bit_size) {
+ dst[i] = 0;
+ } else if (shift < 0) {
+ dst[i] = static_cast<T>(element >> -shift);
+ } else if (shift >= static_cast<S>(bit_size)) {
+ dst[i] = std::numeric_limits<T>::max();
+ qc_flag = true;
+ } else {
+ const T shifted = element << shift;
+
+ if ((shifted >> shift) != element) {
+ dst[i] = std::numeric_limits<T>::max();
+ qc_flag = true;
+ } else {
+ dst[i] = shifted;
+ }
+ }
+ }
+
+ return qc_flag;
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u8>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u16>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u32>);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u64>);
+}
+
+void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmovzxbw(a, a);
+ } else {
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ code.pxor(zeros, zeros);
+ code.punpcklbw(a, zeros);
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmovzxwd(a, a);
+ } else {
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ code.pxor(zeros, zeros);
+ code.punpcklwd(a, zeros);
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.pmovzxdq(a, a);
+ } else {
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ code.pxor(zeros, zeros);
+ code.punpckldq(a, zeros);
+ }
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm();
+ code.pxor(zeros, zeros);
+ code.punpcklqdq(a, zeros);
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ code.movq(a, a); // TODO: !IsLastUse
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) {
+ const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm();
+ code.pxor(a, a);
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
new file mode 100644
index 0000000000..b8aa3eb653
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -0,0 +1,2182 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+#include <limits>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include <mcl/assert.hpp>
+#include <mcl/mp/metavalue/lift_value.hpp>
+#include <mcl/mp/typelist/cartesian_product.hpp>
+#include <mcl/mp/typelist/get.hpp>
+#include <mcl/mp/typelist/lift_sequence.hpp>
+#include <mcl/mp/typelist/list.hpp>
+#include <mcl/mp/typelist/lower_to_tuple.hpp>
+#include <mcl/type_traits/function_info.hpp>
+#include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/constants.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/op.h"
+#include "dynarmic/common/fp/util.h"
+#include "dynarmic/common/lut_from_list.h"
+#include "dynarmic/interface/optimization_flags.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+namespace mp = mcl::mp;
+
+namespace {
+
+#define FCODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (fsize == 32) { \
+ code.NAME##s(args...); \
+ } else { \
+ code.NAME##d(args...); \
+ } \
+ }
+#define ICODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (fsize == 32) { \
+ code.NAME##d(args...); \
+ } else { \
+ code.NAME##q(args...); \
+ } \
+ }
+
+template<typename Lambda>
+void MaybeStandardFPSCRValue(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, Lambda lambda) {
+ const bool switch_mxcsr = ctx.FPCR(fpcr_controlled) != ctx.FPCR();
+
+ if (switch_mxcsr && !ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) {
+ code.EnterStandardASIMD();
+ lambda();
+ code.LeaveStandardASIMD();
+ } else {
+ lambda();
+ }
+}
+
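+// Produces the default NaN-handling callback: for each lane it propagates a NaN from the
+// inputs via FP::ProcessNaNs, and otherwise replaces a NaN result with the default NaN.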
+template<size_t fsize, template<typename> class Indexer, size_t narg>
+struct NaNHandler {
+public:
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ using function_type = void (*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR);
+
+ static function_type GetDefault() {
+ return GetDefaultImpl(std::make_index_sequence<narg - 1>{});
+ }
+
+private:
+ template<size_t... argi>
+ static function_type GetDefaultImpl(std::index_sequence<argi...>) {
+ const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) {
+ VectorArray<FPT>& result = values[0];
+ for (size_t elementi = 0; elementi < result.size(); ++elementi) {
+ const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...);
+ if (auto r = FP::ProcessNaNs(std::get<argi>(current_values)...)) {
+ result[elementi] = *r;
+ } else if (FP::IsNaN(result[elementi])) {
+ result[elementi] = FP::FPInfo<FPT>::DefaultNaN();
+ }
+ }
+ };
+
+ return static_cast<function_type>(result);
+ }
+};
+
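+// Emits the slow path for NaN handling: if any lane of nan_mask is set, control jumps to a
+// deferred block that spills the operand vectors to the stack, calls nan_handler to patch
+// the NaN lanes element by element, and reloads the corrected result before resuming.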
+template<size_t fsize, size_t nargs, typename NaNHandler>
+void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) {
+ static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.ptest(nan_mask, nan_mask);
+ } else {
+ const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32();
+ code.movmskps(bitmask, nan_mask);
+ code.cmp(bitmask, 0);
+ }
+
+ SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+
+ code.jnz(*nan, code.T_NEAR);
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*nan);
+
+ const Xbyak::Xmm result = xmms[0];
+
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+
+ const size_t stack_space = xmms.size() * 16;
+ code.sub(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE));
+ for (size_t i = 0; i < xmms.size(); ++i) {
+ code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
+ }
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.mov(code.ABI_PARAM2, ctx.FPCR(fpcr_controlled).Value());
+
+ code.CallFunction(nan_handler);
+
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.add(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE));
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ });
+}
+
+template<size_t fsize>
+Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) {
+ return code.BConst<fsize>(xword, value);
+}
+
+template<size_t fsize, u64 value>
+Xbyak::Address GetVectorOf(BlockOfCode& code) {
+ return code.BConst<fsize>(xword, value);
+}
+
+template<size_t fsize>
+Xbyak::Address GetNaNVector(BlockOfCode& code) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ return GetVectorOf<fsize, FP::FPInfo<FPT>::DefaultNaN()>(code);
+}
+
+template<size_t fsize>
+Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ return GetVectorOf<fsize, FP::FPInfo<FPT>::Zero(true)>(code);
+}
+
+template<size_t fsize>
+Xbyak::Address GetNonSignMaskVector(BlockOfCode& code) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT non_sign_mask = FP::FPInfo<FPT>::exponent_mask | FP::FPInfo<FPT>::mantissa_mask;
+ return GetVectorOf<fsize, non_sign_mask>(code);
+}
+
+template<size_t fsize>
+Xbyak::Address GetSmallestNormalVector(BlockOfCode& code) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT smallest_normal_number = FP::FPValue<FPT, false, FP::FPInfo<FPT>::exponent_min, 1>();
+ return GetVectorOf<fsize, smallest_normal_number>(code);
+}
+
+template<size_t fsize, bool sign, int exponent, mcl::unsigned_integer_of_size<fsize> value>
+Xbyak::Address GetVectorOf(BlockOfCode& code) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ return GetVectorOf<fsize, FP::FPValue<FPT, sign, exponent, value>()>(code);
+}
+
+template<size_t fsize>
+void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
+ if (fpcr.DN()) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ const Xbyak::Opmask nan_mask = k1;
+ FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN));
+ FCODE(vblendmp)(result | nan_mask, result, GetNaNVector<fsize>(code));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm nan_mask = xmm0;
+ FCODE(vcmpunordp)(nan_mask, result, result);
+ FCODE(blendvp)(result, GetNaNVector<fsize>(code));
+ } else {
+ const Xbyak::Xmm nan_mask = xmm0;
+ code.movaps(nan_mask, result);
+ FCODE(cmpordp)(nan_mask, nan_mask);
+ code.andps(result, nan_mask);
+ code.andnps(nan_mask, GetNaNVector<fsize>(code));
+ code.orps(result, nan_mask);
+ }
+ }
+}
+
+template<size_t fsize>
+void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
+ const Xbyak::Xmm nan_mask = xmm0;
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
+ FpFixup::PosZero);
+ FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpordp)(nan_mask, result, result);
+ FCODE(vandp)(result, result, nan_mask);
+ } else {
+ code.movaps(nan_mask, result);
+ FCODE(cmpordp)(nan_mask, nan_mask);
+ code.andps(result, nan_mask);
+ }
+}
+
+template<size_t fsize>
+void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
+ if (fpcr.FZ()) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ constexpr u32 denormal_to_zero = FixupLUT(
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src,
+ FpFixup::Norm_Src);
+
+ FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero));
+
+ for (const Xbyak::Xmm& xmm : to_daz) {
+ FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
+ }
+ return;
+ }
+
+ if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+ code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+ } else {
+ code.xorps(tmp, tmp);
+ }
+ for (const Xbyak::Xmm& xmm : to_daz) {
+ FCODE(addp)(xmm, tmp);
+ }
+ }
+}
+
+template<typename T>
+struct DefaultIndexer {
+ std::tuple<T> operator()(size_t i, const VectorArray<T>& a) {
+ return std::make_tuple(a[i]);
+ }
+
+ std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) {
+ return std::make_tuple(a[i], b[i]);
+ }
+
+ std::tuple<T, T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b, const VectorArray<T>& c) {
+ return std::make_tuple(a[i], b[i], c[i]);
+ }
+};
+
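+// Mirrors the element ordering of paired (horizontal) operations: the low half of the
+// result is formed from adjacent element pairs of the first operand, the high half from
+// pairs of the second.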
+template<typename T>
+struct PairedIndexer {
+ std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) {
+ constexpr size_t halfway = std::tuple_size_v<VectorArray<T>> / 2;
+ const size_t which_array = i / halfway;
+ i %= halfway;
+ switch (which_array) {
+ case 0:
+ return std::make_tuple(a[2 * i], a[2 * i + 1]);
+ case 1:
+ return std::make_tuple(b[2 * i], b[2 * i + 1]);
+ }
+ UNREACHABLE();
+ }
+};
+
+template<typename T>
+struct PairedLowerIndexer {
+ std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) {
+ constexpr size_t array_size = std::tuple_size_v<VectorArray<T>>;
+ if constexpr (array_size == 4) {
+ switch (i) {
+ case 0:
+ return std::make_tuple(a[0], a[1]);
+ case 1:
+ return std::make_tuple(b[0], b[1]);
+ default:
+ return std::make_tuple(0, 0);
+ }
+ } else if constexpr (array_size == 2) {
+ if (i == 0) {
+ return std::make_tuple(a[0], b[0]);
+ }
+ return std::make_tuple(0, 0);
+ } else {
+ UNREACHABLE();
+ }
+ }
+};
+
+template<size_t fsize, template<typename> class Indexer, size_t fpcr_controlled_arg_index = 1, typename Function>
+void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 2>::function_type nan_handler = NaNHandler<fsize, Indexer, 2>::GetDefault()) {
+ static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
+
+ if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ Xbyak::Xmm result;
+
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ (code.*fn)(result);
+ });
+ } else {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ result = ctx.reg_alloc.ScratchXmm();
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ fn(result, xmm_a);
+ });
+ }
+
+ if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ code.movaps(result, xmm_a);
+ (code.*fn)(result);
+ } else {
+ fn(result, xmm_a);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpunordp)(nan_mask, result, result);
+ } else {
+ code.movaps(nan_mask, result);
+ FCODE(cmpunordp)(nan_mask, nan_mask);
+ }
+
+ HandleNaNs<fsize, 1>(code, ctx, fpcr_controlled, {result, xmm_a}, nan_mask, nan_handler);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+enum class CheckInputNaN {
+ Yes,
+ No,
+};
+
+template<size_t fsize, template<typename> class Indexer, typename Function>
+void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, CheckInputNaN check_input_nan = CheckInputNaN::No, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) {
+ static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
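+    // With FPCR.DN set (or inaccurate NaNs permitted) any NaN lanes can simply be overwritten
+    // with the default NaN afterwards, so the per-lane NaN fallback is skipped entirely.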
+ if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ (code.*fn)(xmm_a, xmm_b);
+ });
+ } else {
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ fn(xmm_a, xmm_b);
+ });
+ }
+
+ if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
+ }
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+ return;
+ }
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, xmm_a);
+
+ if (check_input_nan == CheckInputNaN::Yes) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpunordp)(nan_mask, xmm_a, xmm_b);
+ } else {
+ code.movaps(nan_mask, xmm_b);
+ FCODE(cmpunordp)(nan_mask, xmm_a);
+ }
+ }
+
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ (code.*fn)(result, xmm_b);
+ } else {
+ fn(result, xmm_b);
+ }
+
+ if (check_input_nan == CheckInputNaN::Yes) {
+ FCODE(cmpunordp)(nan_mask, result);
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpunordp)(nan_mask, result, result);
+ } else {
+ code.movaps(nan_mask, result);
+ FCODE(cmpunordp)(nan_mask, nan_mask);
+ }
+
+ HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Lambda lambda, bool fpcr_controlled) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+
+ const u32 fpcr = ctx.FPCR(fpcr_controlled).Value();
+
+ constexpr u32 stack_space = 2 * 16;
+ code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.mov(code.ABI_PARAM3.cvt32(), fpcr);
+ code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.CallFunction(fn);
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+ code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+}
+
+template<size_t fpcr_controlled_arg_index = 1, typename Lambda>
+void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
+
+ EmitTwoOpFallbackWithoutRegAlloc(code, ctx, result, arg1, lambda, fpcr_controlled);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+template<typename Lambda>
+void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Lambda lambda, bool fpcr_controlled) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+
+ const u32 fpcr = ctx.FPCR(fpcr_controlled).Value();
+
+#ifdef _WIN32
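+    // Win64 passes only four arguments in registers; the fifth argument (the pointer to the
+    // FPSR exception flags) goes in the stack slot immediately above the shadow space.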
+ constexpr u32 stack_space = 4 * 16;
+ code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
+ code.mov(code.ABI_PARAM4.cvt32(), fpcr);
+ code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], rax);
+#else
+ constexpr u32 stack_space = 3 * 16;
+ code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+ code.mov(code.ABI_PARAM4.cvt32(), fpcr);
+ code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+#endif
+
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.movaps(xword[code.ABI_PARAM3], arg2);
+ code.CallFunction(fn);
+
+#ifdef _WIN32
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+#else
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+#endif
+
+ code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+}
+
+template<typename Lambda>
+void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, lambda, fpcr_controlled);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+enum class LoadPreviousResult {
+ Yes,
+ No,
+};
+
+template<LoadPreviousResult load_previous_result = LoadPreviousResult::No, typename Lambda>
+void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Xbyak::Xmm arg3, Lambda lambda, bool fpcr_controlled) {
+ const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
+
+#ifdef _WIN32
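+    // Win64: the fifth argument (the FPCR value) and the sixth (the FPSR exception pointer)
+    // do not fit in registers, so they occupy the first two stack slots above the shadow space.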
+ constexpr u32 stack_space = 5 * 16;
+ code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
+ code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 4 * 16]);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], ctx.FPCR(fpcr_controlled).Value());
+ code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+ code.mov(qword[rsp + ABI_SHADOW_SPACE + 8], rax);
+#else
+ constexpr u32 stack_space = 4 * 16;
+ code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+ code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+ code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+ code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+ code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]);
+ code.mov(code.ABI_PARAM5.cvt32(), ctx.FPCR(fpcr_controlled).Value());
+ code.lea(code.ABI_PARAM6, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+#endif
+
+ if constexpr (load_previous_result == LoadPreviousResult::Yes) {
+ code.movaps(xword[code.ABI_PARAM1], result);
+ }
+ code.movaps(xword[code.ABI_PARAM2], arg1);
+ code.movaps(xword[code.ABI_PARAM3], arg2);
+ code.movaps(xword[code.ABI_PARAM4], arg3);
+ code.CallFunction(fn);
+
+#ifdef _WIN32
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+#else
+ code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+#endif
+
+ code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+}
+
+template<typename Lambda>
+void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ ctx.reg_alloc.EndOfAllocScope();
+ ctx.reg_alloc.HostCall(nullptr);
+
+ EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, arg3, lambda, fpcr_controlled);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+} // anonymous namespace
+
+template<size_t fsize>
+void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ code.andps(a, GetNonSignMaskVector<fsize>(code));
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorAbs<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorAbs<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorAbs<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::addps);
+}
+
+void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
+}
+
+void EmitX64::EmitFPVectorDiv32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::divps);
+}
+
+void EmitX64::EmitFPVectorDiv64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::divpd);
+}
+
+void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& op1, const VectorArray<u16>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPCompareEQ(op1[i], op2[i], fpcr, fpsr) ? 0xFFFF : 0;
+ }
+ });
+}
+
+void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmpeqps(a, b);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmpeqpd(a, b);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+ if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
+
+ code.vcvtph2ps(result, value);
+ ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
+
+ for (size_t i = 0; i < output.size(); ++i) {
+ output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr);
+ }
+ })};
+ },
+ mp::cartesian_product<rounding_list>{});
+
+ EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
+}
+
+void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const int fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.cvtdq2ps(xmm, xmm);
+ if (fbits != 0) {
+ code.mulps(xmm, GetVectorOf<32>(code, static_cast<u32>(127 - fbits) << 23));
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, xmm);
+}
+
+void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const int fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ code.vcvtqq2pd(xmm, xmm);
+ } else if (code.HasHostFeature(HostFeature::SSE41)) {
+ const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+ // First quadword
+ code.movq(tmp, xmm);
+ code.cvtsi2sd(xmm, tmp);
+
+ // Second quadword
+ code.pextrq(tmp, xmm, 1);
+ code.cvtsi2sd(xmm_tmp, tmp);
+
+ // Combine
+ code.unpcklpd(xmm, xmm_tmp);
+ } else {
+ const Xbyak::Xmm high_xmm = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+ // First quadword
+ code.movhlps(high_xmm, xmm);
+ code.movq(tmp, xmm);
+ code.cvtsi2sd(xmm, tmp);
+
+ // Second quadword
+ code.movq(tmp, high_xmm);
+ code.cvtsi2sd(xmm_tmp, tmp);
+
+ // Combine
+ code.unpcklpd(xmm, xmm_tmp);
+ }
+
+ if (fbits != 0) {
+ code.mulpd(xmm, GetVectorOf<64>(code, static_cast<u64>(1023 - fbits) << 52));
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, xmm);
+}
+
+void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const int fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vcvtudq2ps(xmm, xmm);
+ } else {
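+            // Without AVX-512 there is no unsigned dword-to-float conversion, so build it from
+            // halves: (low16 | 0x4B000000) is the float 2^23 + low16, and ((x >> 16) | 0x53000000)
+            // is 2^39 + high16 * 2^16. Adding -(2^39 + 2^23) (i.e. 0xD3000080) and then the low
+            // half gives the correctly rounded conversion.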
+ const Xbyak::Address mem_4B000000 = code.BConst<32>(xword, 0x4B000000);
+ const Xbyak::Address mem_53000000 = code.BConst<32>(xword, 0x53000000);
+ const Xbyak::Address mem_D3000080 = code.BConst<32>(xword, 0xD3000080);
+
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010);
+ code.vpsrld(xmm, xmm, 16);
+ code.vpblendw(xmm, xmm, mem_53000000, 0b10101010);
+ code.vaddps(xmm, xmm, mem_D3000080);
+ code.vaddps(xmm, tmp, xmm);
+ } else {
+ const Xbyak::Address mem_0xFFFF = code.BConst<32>(xword, 0x0000FFFF);
+
+ code.movdqa(tmp, mem_0xFFFF);
+
+ code.pand(tmp, xmm);
+ code.por(tmp, mem_4B000000);
+ code.psrld(xmm, 16);
+ code.por(xmm, mem_53000000);
+ code.addps(xmm, mem_D3000080);
+ code.addps(xmm, tmp);
+ }
+ }
+
+ if (fbits != 0) {
+ code.mulps(xmm, GetVectorOf<32>(code, static_cast<u32>(127 - fbits) << 23));
+ }
+
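+        // When the conversion is done with the addition trick above, a zero input ends up as
+        // x + (-x), which rounds to -0.0 under round-towards-minus-infinity; clear the sign bit
+        // so that zero still converts to +0.0.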
+ if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
+ code.pand(xmm, code.BConst<32>(xword, 0x7FFFFFFF));
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, xmm);
+}
+
+void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const int fbits = args[1].GetImmediateU8();
+ const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode());
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ code.vcvtuqq2pd(xmm, xmm);
+ } else {
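+            // Without AVX-512 there is no unsigned qword-to-double conversion: pair each 32-bit
+            // half with the exponent patterns 0x43300000 (2^52) and 0x45300000 (2^84), subtract
+            // those biases, and sum the two halves to obtain the correctly rounded result.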
+ const Xbyak::Address unpack = code.Const(xword, 0x4530000043300000, 0);
+ const Xbyak::Address subtrahend = code.Const(xword, 0x4330000000000000, 0x4530000000000000);
+
+ const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vmovapd(unpack_reg, unpack);
+ code.vmovapd(subtrahend_reg, subtrahend);
+
+ code.vunpcklps(tmp1, xmm, unpack_reg);
+ code.vsubpd(tmp1, tmp1, subtrahend_reg);
+
+ code.vpermilps(xmm, xmm, 0b01001110);
+
+ code.vunpcklps(xmm, xmm, unpack_reg);
+ code.vsubpd(xmm, xmm, subtrahend_reg);
+
+ code.vhaddpd(xmm, tmp1, xmm);
+ } else {
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ code.movapd(unpack_reg, unpack);
+ code.movapd(subtrahend_reg, subtrahend);
+
+ code.pshufd(tmp1, xmm, 0b01001110);
+
+ code.punpckldq(xmm, unpack_reg);
+ code.subpd(xmm, subtrahend_reg);
+ code.pshufd(tmp2, xmm, 0b01001110);
+ code.addpd(xmm, tmp2);
+
+ code.punpckldq(tmp1, unpack_reg);
+ code.subpd(tmp1, subtrahend_reg);
+
+ code.pshufd(unpack_reg, tmp1, 0b01001110);
+ code.addpd(unpack_reg, tmp1);
+
+ code.unpcklpd(xmm, unpack_reg);
+ }
+ }
+
+ if (fbits != 0) {
+ code.mulpd(xmm, GetVectorOf<64>(code, static_cast<u64>(1023 - fbits) << 52));
+ }
+
+ if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) {
+ code.pand(xmm, code.BConst<64>(xword, 0x7FFFFFFFFFFFFFFF));
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, xmm);
+}
+
+void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmpltps(b, a);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, b);
+}
+
+void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmpltpd(b, a);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, b);
+}
+
+void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmpleps(b, a);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, b);
+}
+
+void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0);
+ code.cmplepd(b, a);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, b);
+}
+
+template<size_t fsize, bool is_max>
+static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+ if (ctx.FPCR(fpcr_controlled).DN()) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
+
+ const Xbyak::Xmm mask = xmm0;
+ const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ constexpr FpRangeSelect range_select = is_max ? FpRangeSelect::Max : FpRangeSelect::Min;
+ FCODE(vcmpp)(k1, result, xmm_b, Cmp::Unordered_Q);
+ FCODE(vrangep)(result, result, xmm_b, FpRangeLUT(range_select, FpRangeSign::Preserve));
+ FCODE(vblendmp)(result | k1, result, GetNaNVector<fsize>(code));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpeqp)(mask, result, xmm_b);
+ FCODE(vcmpunordp)(nan_mask, result, xmm_b);
+ if constexpr (is_max) {
+ FCODE(vandp)(eq, result, xmm_b);
+ FCODE(vmaxp)(result, result, xmm_b);
+ } else {
+ FCODE(vorp)(eq, result, xmm_b);
+ FCODE(vminp)(result, result, xmm_b);
+ }
+ FCODE(blendvp)(result, eq);
+ FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask);
+ } else {
+ code.movaps(mask, result);
+ code.movaps(eq, result);
+ code.movaps(nan_mask, result);
+ FCODE(cmpneqp)(mask, xmm_b);
+ FCODE(cmpordp)(nan_mask, xmm_b);
+
+ if constexpr (is_max) {
+ code.andps(eq, xmm_b);
+ FCODE(maxp)(result, xmm_b);
+ } else {
+ code.orps(eq, xmm_b);
+ FCODE(minp)(result, xmm_b);
+ }
+
+ code.andps(result, mask);
+ code.andnps(mask, eq);
+ code.orps(result, mask);
+
+ code.andps(result, nan_mask);
+ code.andnps(nan_mask, GetNaNVector<fsize>(code));
+ code.orps(result, nan_mask);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+
+ return;
+ }
+
+ EmitThreeOpVectorOperation<fsize, DefaultIndexer>(
+ code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b) {
+ const Xbyak::Xmm mask = xmm0;
+ const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
+
+ if (ctx.FPCR(fpcr_controlled).FZ()) {
+ const Xbyak::Xmm prev_xmm_b = xmm_b;
+ xmm_b = ctx.reg_alloc.ScratchXmm();
+ code.movaps(xmm_b, prev_xmm_b);
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask);
+ }
+
+            // What we are doing here is handling the case when the inputs are differently signed zeros.
+            // x86-64 treats differently signed zeros as equal while ARM does not.
+            // Thus for max we AND the values x86-64 reports as equal to obtain the positive zero,
+            // and for min we OR them to obtain the negative zero; the blend below selects that
+            // value for the lanes the compare flagged as equal.
+
+ // vrangep{s,d} here ends up not being significantly shorter than the AVX implementation
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpeqp)(mask, result, xmm_b);
+ if constexpr (is_max) {
+ FCODE(vandp)(eq, result, xmm_b);
+ FCODE(vmaxp)(result, result, xmm_b);
+ } else {
+ FCODE(vorp)(eq, result, xmm_b);
+ FCODE(vminp)(result, result, xmm_b);
+ }
+ FCODE(blendvp)(result, eq);
+ } else {
+ code.movaps(mask, result);
+ code.movaps(eq, result);
+ FCODE(cmpneqp)(mask, xmm_b);
+
+ if constexpr (is_max) {
+ code.andps(eq, xmm_b);
+ FCODE(maxp)(result, xmm_b);
+ } else {
+ code.orps(eq, xmm_b);
+ FCODE(minp)(result, xmm_b);
+ }
+
+ code.andps(result, mask);
+ code.andnps(mask, eq);
+ code.orps(result, mask);
+ }
+ },
+ CheckInputNaN::Yes);
+}
+
+template<size_t fsize, bool is_max>
+static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm();
+
+ const Xbyak::Xmm tmp1 = xmm0;
+ const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+
+ // NaN requirements:
+ // op1 op2 result
+ // SNaN anything op1
+ // !SNaN SNaN op2
+ // QNaN !NaN op2
+ // !NaN QNaN op1
+ // QNaN QNaN op1
+
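+    // NaNs are detected with unordered compares; the shift-left / arithmetic-shift-right
+    // sequences below broadcast the top mantissa bit (the quiet bit) across each lane, which
+    // distinguishes QNaNs (bit set) from SNaNs (bit clear).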
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ // result = xmm_a == SNaN || xmm_b == QNaN
+ {
+ // evaluate xmm_b == QNaN
+ FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
+ ICODE(vpsll)(tmp2, xmm_b, static_cast<u8>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
+ {
+ code.vpsrad(tmp2, tmp2, 31);
+ if constexpr (fsize == 64) {
+ code.vpshufd(tmp2, tmp2, 0b11110101);
+ }
+ }
+ code.vandps(result, tmp1, tmp2);
+
+ // evaluate xmm_a == SNaN
+ FCODE(vcmpunordp)(tmp1, xmm_a, xmm_a);
+ ICODE(vpsll)(tmp2, xmm_a, static_cast<u8>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
+ {
+ code.vpsrad(tmp2, tmp2, 31);
+ if constexpr (fsize == 64) {
+ code.vpshufd(tmp2, tmp2, 0b11110101);
+ }
+ }
+ code.vandnps(tmp2, tmp2, tmp1);
+
+ code.vorps(result, tmp2);
+ }
+
+            // Denormalization quiets SNaNs, so it must happen after SNaN detection!
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1);
+
+ // intermediate result = max/min(xmm_a, xmm_b)
+ {
+ const Xbyak::Xmm eq_mask = tmp1;
+ const Xbyak::Xmm eq = tmp2;
+
+ FCODE(vcmpeqp)(eq_mask, xmm_a, xmm_b);
+
+ if constexpr (is_max) {
+ code.vandps(eq, xmm_a, xmm_b);
+ FCODE(vmaxp)(intermediate_result, xmm_a, xmm_b);
+ } else {
+ code.vorps(eq, xmm_a, xmm_b);
+ FCODE(vminp)(intermediate_result, xmm_a, xmm_b);
+ }
+
+ code.blendvps(intermediate_result, eq); // eq_mask is in xmm0
+ }
+
+ {
+ code.vblendvps(result, intermediate_result, xmm_a, result);
+ }
+
+ if (ctx.FPCR(fpcr_controlled).DN()) {
+ const Xbyak::Xmm ord_mask = tmp1;
+
+ FCODE(vcmpunordp)(ord_mask, result, result);
+ code.blendvps(result, GetNaNVector<fsize>(code)); // ord_mask is in xmm0
+ } else {
+ const Xbyak::Xmm nan_mask = tmp1;
+
+ FCODE(vcmpunordp)(nan_mask, result, result);
+ code.vandps(nan_mask, nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code));
+ code.vorps(result, result, nan_mask);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ // result = xmm_a == SNaN || xmm_b == QNaN
+ {
+ // evaluate xmm_b == QNaN
+ code.xorps(tmp1, tmp1);
+ FCODE(cmpunordp)(tmp1, xmm_b);
+ code.movaps(tmp2, xmm_b);
+ ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
+ {
+ code.psrad(tmp2, 31);
+ if constexpr (fsize == 64) {
+ code.pshufd(tmp2, tmp2, 0b11110101);
+ }
+ }
+ code.andps(tmp1, tmp2);
+
+ code.movaps(result, tmp1);
+
+ // evaluate xmm_a == SNaN
+ code.xorps(tmp1, tmp1);
+ FCODE(cmpunordp)(tmp1, xmm_a);
+ code.movaps(tmp2, xmm_a);
+ ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width));
+ {
+ code.psrad(tmp2, 31);
+ if constexpr (fsize == 64) {
+ code.pshufd(tmp2, tmp2, 0b11110101);
+ }
+ }
+ code.andnps(tmp2, tmp1);
+
+ code.orps(result, tmp2);
+ }
+
+        // Denormalization quiets SNaNs, so it must happen after SNaN detection!
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1);
+
+ // intermediate result = max/min(xmm_a, xmm_b)
+ {
+ const Xbyak::Xmm eq_mask = tmp1;
+ const Xbyak::Xmm eq = tmp2;
+
+ code.movaps(eq_mask, xmm_a);
+ FCODE(cmpneqp)(eq_mask, xmm_b);
+
+ code.movaps(eq, xmm_a);
+ code.movaps(intermediate_result, xmm_a);
+ if constexpr (is_max) {
+ code.andps(eq, xmm_b);
+ FCODE(maxp)(intermediate_result, xmm_b);
+ } else {
+ code.orps(eq, xmm_b);
+ FCODE(minp)(intermediate_result, xmm_b);
+ }
+
+ code.andps(intermediate_result, eq_mask);
+ code.andnps(eq_mask, eq);
+ code.orps(intermediate_result, eq_mask);
+ }
+
+ {
+ code.andps(xmm_a, result);
+ code.andnps(result, intermediate_result);
+ code.orps(result, xmm_a);
+ }
+
+ if (ctx.FPCR(fpcr_controlled).DN()) {
+ const Xbyak::Xmm ord_mask = tmp1;
+
+ code.xorps(ord_mask, ord_mask);
+ FCODE(cmpordp)(ord_mask, result);
+
+ code.andps(result, ord_mask);
+ code.andnps(ord_mask, GetNaNVector<fsize>(code));
+ code.orps(result, ord_mask);
+ } else {
+ const Xbyak::Xmm nan_mask = tmp1;
+
+ code.xorps(nan_mask, nan_mask);
+ FCODE(cmpunordp)(nan_mask, result);
+ code.andps(nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code));
+ code.orps(result, nan_mask);
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMax<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMax<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMaxNumeric<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMaxNumeric<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMax<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMax<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMaxNumeric<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMinMaxNumeric<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
+}
+
+void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
+}
+
+template<typename FPT, bool needs_rounding_correction, bool needs_nan_correction>
+static void EmitFPVectorMulAddFallback(VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, [[maybe_unused]] FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ if constexpr (needs_rounding_correction) {
+ constexpr FPT non_sign_mask = FP::FPInfo<FPT>::exponent_mask | FP::FPInfo<FPT>::mantissa_mask;
+ constexpr FPT smallest_normal_number = FP::FPValue<FPT, false, FP::FPInfo<FPT>::exponent_min, 1>();
+ if ((result[i] & non_sign_mask) == smallest_normal_number) {
+ result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
+ continue;
+ }
+ }
+ if constexpr (needs_nan_correction) {
+ if (FP::IsNaN(result[i])) {
+ if (FP::IsQNaN(addend[i]) && ((FP::IsZero(op1[i], fpcr) && FP::IsInf(op2[i])) || (FP::IsInf(op1[i]) && FP::IsZero(op2[i], fpcr)))) {
+ result[i] = FP::FPInfo<FPT>::DefaultNaN();
+ } else if (auto r = FP::ProcessNaNs(addend[i], op1[i], op2[i])) {
+ result[i] = *r;
+ } else {
+ result[i] = FP::FPInfo<FPT>::DefaultNaN();
+ }
+ }
+ }
+ }
+}
+
+template<size_t fsize>
+void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr);
+ }
+ };
+
+ if constexpr (fsize != 16) {
+ const bool fpcr_controlled = inst->GetArg(3).GetU1();
+ const bool needs_rounding_correction = ctx.FPCR(fpcr_controlled).FZ();
+ const bool needs_nan_correction = !(ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN));
+
+ if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.movaps(result, xmm_a);
+ FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+
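+                // Find lanes that need the precise soft-float fallback: results whose magnitude
+                // equals the smallest normal number (flush-to-zero behaviour can differ between
+                // the host FMA and ARM at that boundary) and/or lanes that produced a NaN and
+                // need exact ARM NaN propagation.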
+ if (needs_rounding_correction && needs_nan_correction) {
+ code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code));
+ FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+ } else if (needs_rounding_correction) {
+ code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code));
+ ICODE(vpcmpeq)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
+ } else if (needs_nan_correction) {
+ FCODE(vcmpunordp)(tmp, result, result);
+ }
+ code.vptest(tmp, tmp);
+ code.jnz(*fallback, code.T_NEAR);
+ code.L(*end);
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+ });
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ if (needs_rounding_correction && needs_nan_correction) {
+ EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, true>, fpcr_controlled);
+ } else if (needs_rounding_correction) {
+ EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, false>, fpcr_controlled);
+ } else if (needs_nan_correction) {
+ EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, false, true>, fpcr_controlled);
+ }
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
+ const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+
+ FCODE(mulp)(operand2, operand3);
+ FCODE(addp)(operand1, operand2);
+
+ ctx.reg_alloc.DefineValue(inst, operand1);
+ return;
+ }
+ }
+
+ EmitFourOpFallback(code, ctx, inst, fallback_fn);
+}
+
+void EmitX64::EmitFPVectorMulAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMulAdd<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMulAdd<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMulAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMulAdd<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ if (ctx.FPCR(fpcr_controlled).DN() && code.HasHostFeature(HostFeature::AVX)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm twos = ctx.reg_alloc.ScratchXmm();
+
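+        // FMULX differs from an ordinary multiply only for 0 * +/-infinity, which returns +/-2.0
+        // (sign = XOR of the operand signs) rather than a NaN. Lanes that became NaN without a
+        // NaN input are patched with that +/-2.0; lanes with a NaN input receive the default NaN
+        // (this path is only taken when FPCR.DN is set).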
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ FCODE(vcmpunordp)(xmm0, result, operand);
+ FCODE(vxorp)(twos, result, operand);
+ FCODE(mulp)(result, operand);
+ FCODE(andp)(twos, GetNegativeZeroVector<fsize>(code));
+ FCODE(vcmpunordp)(tmp, result, result);
+ FCODE(blendvp)(result, GetNaNVector<fsize>(code));
+ FCODE(orp)(twos, GetVectorOf<fsize, false, 0, 2>(code));
+ FCODE(andnp)(xmm0, tmp);
+ FCODE(blendvp)(result, twos);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(nan_mask, xmm_b);
+ code.movaps(result, xmm_a);
+ FCODE(cmpunordp)(nan_mask, xmm_a);
+ FCODE(mulp)(result, xmm_b);
+ FCODE(cmpunordp)(nan_mask, result);
+
+ const auto nan_handler = Common::FptrCast(
+ [](std::array<VectorArray<FPT>, 3>& values, FP::FPCR fpcr) {
+ VectorArray<FPT>& result = values[0];
+ for (size_t elementi = 0; elementi < result.size(); ++elementi) {
+ if (auto r = FP::ProcessNaNs(values[1][elementi], values[2][elementi])) {
+ result[elementi] = fpcr.DN() ? FP::FPInfo<FPT>::DefaultNaN() : *r;
+ } else if (FP::IsNaN(result[elementi])) {
+ const FPT sign = (values[1][elementi] ^ values[2][elementi]) & FP::FPInfo<FPT>::sign_mask;
+ result[elementi] = sign | FP::FPValue<FPT, false, 0, 2>();
+ }
+ }
+ });
+
+ HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPVectorMulX32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMulX<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMulX64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorMulX<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+ constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Address mask = code.BConst<fsize>(xword, sign_mask);
+
+ code.xorps(a, mask);
+
+ ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorNeg<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorNeg32(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorNeg<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorNeg64(EmitContext& ctx, IR::Inst* inst) {
+ FPVectorNeg<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, PairedIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::haddps);
+}
+
+void EmitX64::EmitFPVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, PairedIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::haddpd);
+}
+
+void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+ code.xorps(zero, zero);
+ code.punpcklqdq(result, xmm_b);
+ code.haddps(result, zero);
+ });
+}
+
+void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+ const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
+ code.xorps(zero, zero);
+ code.punpcklqdq(result, xmm_b);
+ code.haddpd(result, zero);
+ });
+}
+
+template<size_t fsize>
+static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ if constexpr (fsize != 16) {
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ FCODE(vrcp14p)(result, operand);
+ } else {
+ if constexpr (fsize == 32) {
+ code.rcpps(result, operand);
+ } else {
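+                    // There is no packed double-precision reciprocal estimate before AVX-512,
+                    // so approximate in single precision and widen back.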
+ code.cvtpd2ps(result, operand);
+ code.rcpps(result, result);
+ code.cvtps2pd(result, result);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPRecipEstimate<FPT>(operand[i], fpcr, fpsr);
+ }
+ });
+}
+
+void EmitX64::EmitFPVectorRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipEstimate<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipEstimate<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipEstimate<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPRecipStepFused<FPT>(op1[i], op2[i], fpcr, fpsr);
+ }
+ };
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+
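+            // FPRecipStepFused(a, b) = 2.0 - a*b: start from the constant 2.0 and fold the
+            // product in with a single fused negate-multiply-add.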
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+ SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+
+ FCODE(vcmpunordp)(tmp, result, result);
+ code.vptest(tmp, tmp);
+ code.jnz(*fallback, code.T_NEAR);
+ code.L(*end);
+ });
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
+ FCODE(mulp)(operand1, operand2);
+ FCODE(subp)(result, operand1);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ EmitThreeOpFallback(code, ctx, inst, fallback_fn);
+}
+
+void EmitX64::EmitFPVectorRecipStepFused16(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipStepFused<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRecipStepFused32(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipStepFused<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
+ EmitRecipStepFused<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool exact = inst->GetArg(2).GetU1();
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
+ const u8 round_imm = [&]() -> u8 {
+ switch (rounding) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ return 0b00;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ return 0b10;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ return 0b01;
+ case FP::RoundingMode::TowardsZero:
+ return 0b11;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ EmitTwoOpVectorOperation<fsize, DefaultIndexer, 3>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_a) {
+ FCODE(roundp)(result, xmm_a, round_imm);
+ });
+
+ return;
+ }
+ }
+
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+ using exact_list = mp::list<std::true_type, std::false_type>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
+ constexpr bool exact = mp::get<1, I>::value;
+
+ for (size_t i = 0; i < output.size(); ++i) {
+ output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
+ }
+ })};
+ },
+ mp::cartesian_product<rounding_list, exact_list>{});
+
+ EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact)));
+}
+
+void EmitX64::EmitFPVectorRoundInt16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorRoundInt<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRoundInt32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorRoundInt<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRoundInt64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorRoundInt<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPRSqrtEstimate<FPT>(operand[i], fpcr, fpsr);
+ }
+ };
+
+ if constexpr (fsize != 16) {
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ FCODE(vrsqrt14p)(result, operand);
+ } else {
+ if constexpr (fsize == 32) {
+ code.rsqrtps(result, operand);
+ } else {
+ code.cvtpd2ps(result, operand);
+ code.rsqrtps(result, result);
+ code.cvtps2pd(result, result);
+ }
+ }
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[1].GetImmediateU1();
+
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm();
+
+ SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+
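+            // Quantise the input down to a few leading fraction bits (with a midpoint bit
+            // injected), take a full-precision 1/sqrt, then round the quotient back to the
+            // estimate's coarse precision. Inputs that are NaN, negative, zero, denormal or
+            // infinite are punted to the fallback below.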
+ code.movaps(value, operand);
+
+ code.movaps(xmm0, GetVectorOf<fsize, (fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000)>(code));
+ code.pand(value, xmm0);
+ code.por(value, GetVectorOf<fsize, (fsize == 32 ? 0x00008000 : 0x0000'1000'0000'0000)>(code));
+
+ // Detect NaNs, negatives, zeros, denormals and infinities
+ FCODE(vcmpnge_uqp)(result, value, GetVectorOf<fsize, (FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width)>(code));
+ code.vptest(result, result);
+ code.jnz(*bad_values, code.T_NEAR);
+
+ FCODE(sqrtp)(value, value);
+ code.vmovaps(result, GetVectorOf<fsize, FP::FPValue<FPT, false, 0, 1>()>(code));
+ FCODE(divp)(result, value);
+
+ ICODE(padd)(result, GetVectorOf<fsize, (fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000)>(code));
+ code.pand(result, xmm0);
+
+ code.L(*end);
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*bad_values);
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ EmitTwoOpFallbackWithoutRegAlloc(code, ctx, result, operand, fallback_fn, fpcr_controlled);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ EmitTwoOpFallback(code, ctx, inst, fallback_fn);
+}
+
+void EmitX64::EmitFPVectorRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtEstimate<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtEstimate<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtEstimate<64>(code, ctx, inst);
+}
+
+template<size_t fsize>
+static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>;
+
+ const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ for (size_t i = 0; i < result.size(); i++) {
+ result[i] = FP::FPRSqrtStepFused<FPT>(op1[i], op2[i], fpcr, fpsr);
+ }
+ };
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+
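+            // FPRSqrtStepFused(a, b) = (3.0 - a*b) / 2: a fused negate-multiply-add from the
+            // constant 3.0, followed by a multiply by 0.5.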
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+ FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+
+ SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+
+ // An explanation for this is given in EmitFPRSqrtStepFused.
+ code.vmovaps(mask, GetVectorOf<fsize, (fsize == 32 ? 0x7f000000 : 0x7fe0000000000000)>(code));
+ FCODE(vandp)(tmp, result, mask);
+ ICODE(vpcmpeq)(tmp, tmp, mask);
+ code.ptest(tmp, tmp);
+ code.jnz(*fallback, code.T_NEAR);
+
+ FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
+ code.L(*end);
+ });
+
+ ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+ code.L(*fallback);
+ code.sub(rsp, 8);
+ ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled);
+ ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+ code.add(rsp, 8);
+ code.jmp(*end, code.T_NEAR);
+ });
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+ code.movaps(result, GetVectorOf<fsize, false, 0, 3>(code));
+ FCODE(mulp)(operand1, operand2);
+ FCODE(subp)(result, operand1);
+ FCODE(mulp)(result, GetVectorOf<fsize, false, -1, 1>(code));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ }
+
+ EmitThreeOpFallback(code, ctx, inst, fallback_fn);
+}
+
+void EmitX64::EmitFPVectorRSqrtStepFused16(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtStepFused<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRSqrtStepFused32(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtStepFused<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorRSqrtStepFused64(EmitContext& ctx, IR::Inst* inst) {
+ EmitRSqrtStepFused<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorSqrt32(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, [this](const Xbyak::Xmm& result, const Xbyak::Xmm& operand) {
+ code.sqrtps(result, operand);
+ });
+}
+
+void EmitX64::EmitFPVectorSqrt64(EmitContext& ctx, IR::Inst* inst) {
+ EmitTwoOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, [this](const Xbyak::Xmm& result, const Xbyak::Xmm& operand) {
+ code.sqrtpd(result, operand);
+ });
+}
+
+void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subps);
+}
+
+void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
+}
+
+void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
+ const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+ const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+ if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
+ code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
+
+ for (size_t i = 0; i < output.size(); ++i) {
+ if (i < input.size()) {
+ output[i] = FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr);
+ } else {
+ output[i] = 0;
+ }
+ }
+ })};
+ },
+ mp::cartesian_product<rounding_list>{});
+
+ EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
+}
+
+template<size_t fsize, bool unsigned_>
+void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ const size_t fbits = inst->GetArg(1).GetU8();
+ const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
+ [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
+
+ if constexpr (fsize != 16) {
+ if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ const int round_imm = [&] {
+ switch (rounding) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ default:
+ return 0b00;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ return 0b10;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ return 0b01;
+ case FP::RoundingMode::TowardsZero:
+ return 0b11;
+ }
+ }();
+
+ const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
+ // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
+ (void)ctx;
+
+ if constexpr (fsize == 32) {
+ code.cvttps2dq(src, src);
+ } else {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ code.vcvttpd2qq(src, src);
+ } else {
+ const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
+ const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
+
+ code.cvttsd2si(lo, src);
+ code.punpckhqdq(src, src);
+ code.cvttsd2si(hi, src);
+ code.movq(src, lo);
+ code.pinsrq(src, hi, 1);
+
+ ctx.reg_alloc.Release(hi);
+ ctx.reg_alloc.Release(lo);
+ }
+ }
+ };
+
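+                // Scale by 2^fbits: the multiplier's bit pattern is just the biased exponent
+                // (fbits + 127 or fbits + 1023) with a zero mantissa, i.e. the value 2.0^fbits.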
+ if (fbits != 0) {
+ const u64 scale_factor = fsize == 32
+ ? static_cast<u64>(fbits + 127) << 23
+ : static_cast<u64>(fbits + 1023) << 52;
+ FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor));
+ }
+
+ FCODE(roundp)(src, src, static_cast<u8>(round_imm));
+ ZeroIfNaN<fsize>(code, src);
+
+ constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
+ [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
+
+ if constexpr (unsigned_) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ // Mask positive values
+ code.xorps(xmm0, xmm0);
+ FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
+
+ // Convert positive values to unsigned integers, write 0 anywhere else
+ // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
+ if constexpr (fsize == 32) {
+ code.vcvttps2udq(src | k1 | T_z, src);
+ } else {
+ code.vcvttpd2uqq(src | k1 | T_z, src);
+ }
+ } else {
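+                    // No unsigned packed conversion exists without AVX-512, so: clamp
+                    // negatives to zero, subtract 2^(esize-1) from lanes that exceed the
+                    // signed range, convert as signed, OR the subtracted top bit back in,
+                    // and finally force lanes that were >= 2^esize to all-ones.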
+ // Zero is minimum
+ code.xorps(xmm0, xmm0);
+ FCODE(cmplep)(xmm0, src);
+ FCODE(andp)(src, xmm0);
+
+ // Will we exceed unsigned range?
+ const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
+ code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
+ FCODE(cmplep)(exceed_unsigned, src);
+
+                    // Will we exceed signed range?
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
+ code.movaps(xmm0, tmp);
+ FCODE(cmplep)(xmm0, src);
+ FCODE(andp)(tmp, xmm0);
+ FCODE(subp)(src, tmp);
+ perform_conversion(src);
+ ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
+ FCODE(orp)(src, xmm0);
+
+ // Saturate to max
+ FCODE(orp)(src, exceed_unsigned);
+ }
+ } else {
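+                // cvttps2dq / cvttpd2qq return the integer-indefinite value (INT_MIN) for
+                // out-of-range inputs, which already matches negative saturation; lanes at
+                // or above 2^(esize-1) are flagged in xmm0 beforehand and patched to INT_MAX
+                // via blendv. NaNs were zeroed above.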
+ using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
+ constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());
+
+ code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
+ FCODE(cmplep)(xmm0, src);
+ perform_conversion(src);
+ FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code));
+ }
+ });
+
+ ctx.reg_alloc.DefineValue(inst, src);
+ return;
+ }
+ }
+
+ using fbits_list = mp::lift_sequence<std::make_index_sequence<fsize + 1>>;
+ using rounding_list = mp::list<
+ mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+ mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+ mp::lift_value<FP::RoundingMode::TowardsZero>,
+ mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+ static const auto lut = Common::GenerateLookupTableFromList(
+ []<typename I>(I) {
+ using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
+ return std::pair{
+ mp::lower_to_tuple_v<I>,
+ Common::FptrCast(
+ [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+ constexpr size_t fbits = mp::get<0, I>::value;
+ constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
+
+ for (size_t i = 0; i < output.size(); ++i) {
+ output[i] = static_cast<FPT>(FP::FPToFixed<FPT>(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr));
+ }
+ })};
+ },
+ mp::cartesian_product<fbits_list, rounding_list>{});
+
+ EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(fbits, rounding)));
+}
+
+void EmitX64::EmitFPVectorToSignedFixed16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<16, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorToSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorToSignedFixed64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<64, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorToUnsignedFixed16(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<16, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorToUnsignedFixed32(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorToUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) {
+ EmitFPVectorToFixed<64, true>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
new file mode 100644
index 0000000000..fb30549fb0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
@@ -0,0 +1,340 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/constants.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+
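+// FCODE expands NAME to its floating-point form for the element size (...s for
+// 32-bit, ...d for 64-bit); ICODE expands to the packed-integer form (...d for
+// 32-bit, ...q for 64-bit).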
+#define FCODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (esize == 32) { \
+ code.NAME##s(args...); \
+ } else { \
+ code.NAME##d(args...); \
+ } \
+ }
+
+#define ICODE(NAME) \
+ [&code](auto... args) { \
+ if constexpr (esize == 32) { \
+ code.NAME##d(args...); \
+ } else { \
+ code.NAME##q(args...); \
+ } \
+ }
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+namespace {
+
+void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*saturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*unsaturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*sub_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
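+    // Compute both the saturated and the unsaturated result; their difference is
+    // non-zero exactly in the lanes that clamped, and any such lane sets FPSR.QC.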
+ code.movaps(xmm0, result);
+
+ (code.*saturated_fn)(result, addend);
+
+ (code.*unsaturated_fn)(xmm0, addend);
+ (code.*sub_fn)(xmm0, result);
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.ptest(xmm0, xmm0);
+ } else {
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+ code.pxor(tmp, tmp);
+ code.pcmpeqw(xmm0, tmp);
+ code.pmovmskb(overflow.cvt32(), xmm0);
+ code.xor_(overflow.cvt32(), 0xFFFF);
+ code.test(overflow.cvt32(), overflow.cvt32());
+ }
+ code.setnz(overflow);
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+}
+
+enum class Op {
+ Add,
+ Sub,
+};
+
+template<Op op, size_t esize>
+void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ static_assert(esize == 32 || esize == 64);
+ constexpr u64 msb_mask = esize == 32 ? 0x8000000080000000 : 0x8000000000000000;
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+ code.movaps(xmm0, operand1);
+
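+        // The vpternlogd immediate evaluates the sign-bit overflow predicate in one op:
+        // 0b00100100 marks lanes where both operands share a sign that the result lost
+        // (addition), 0b00011000 marks lanes where the operands differ in sign and the
+        // result no longer matches operand1 (subtraction). vpmov{d,q}2m then gathers
+        // those per-lane sign bits into k1.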
+ if constexpr (op == Op::Add) {
+ ICODE(vpadd)(result, operand1, operand2);
+ code.vpternlogd(xmm0, result, operand2, 0b00100100);
+ } else {
+ ICODE(vpsub)(result, operand1, operand2);
+ code.vpternlogd(xmm0, result, operand2, 0b00011000);
+ }
+ if constexpr (esize == 32) {
+ code.vpmovd2m(k1, xmm0);
+ } else {
+ code.vpmovq2m(k1, xmm0);
+ }
+ ICODE(vpsra)(result | k1, result, u8(esize - 1));
+ ICODE(vpxor)(result | k1, result, code.BConst<esize>(xword_b, msb_mask));
+
+ code.ktestb(k1, k1);
+ code.setnz(overflow);
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
+ const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
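+    // Signed overflow from sign bits alone: addition overflows when the operands
+    // agree in sign but the result differs, ~(op1 ^ op2) & (op1 ^ result);
+    // subtraction overflows when the operands disagree and the result differs
+    // from op1, (op1 ^ op2) & (op1 ^ result).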
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ if constexpr (op == Op::Add) {
+ ICODE(vpadd)(result, operand1, operand2);
+ } else {
+ ICODE(vpsub)(result, operand1, operand2);
+ }
+ code.vpxor(xmm0, operand1, operand2);
+ code.vpxor(tmp, operand1, result);
+ } else {
+ code.movaps(xmm0, operand1);
+ code.movaps(tmp, operand1);
+ if constexpr (op == Op::Add) {
+ ICODE(padd)(result, operand2);
+ } else {
+ ICODE(psub)(result, operand2);
+ }
+ code.pxor(xmm0, operand2);
+ code.pxor(tmp, result);
+ }
+
+ if constexpr (op == Op::Add) {
+ code.pandn(xmm0, tmp);
+ } else {
+ code.pand(xmm0, tmp);
+ }
+
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpsrad(tmp, result, 31);
+ } else {
+ code.movaps(tmp, result);
+ code.psrad(tmp, 31);
+ }
+ if constexpr (esize == 64) {
+ code.pshufd(tmp, tmp, 0b11110101);
+ }
+ code.pxor(tmp, code.BConst<esize>(xword, msb_mask));
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.ptest(xmm0, code.Const(xword, msb_mask, msb_mask));
+ } else {
+ FCODE(movmskp)(overflow.cvt32(), xmm0);
+ code.test(overflow.cvt32(), overflow.cvt32());
+ }
+ code.setnz(overflow);
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ FCODE(blendvp)(result, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ code.psrad(xmm0, 31);
+ if constexpr (esize == 64) {
+ code.pshufd(xmm0, xmm0, 0b11110101);
+ }
+
+ code.pand(tmp, xmm0);
+ code.pandn(xmm0, result);
+ code.por(tmp, xmm0);
+
+ ctx.reg_alloc.DefineValue(inst, tmp);
+ }
+}
+
+template<Op op, size_t esize>
+void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+ static_assert(esize == 32 || esize == 64);
+
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+ if constexpr (op == Op::Add) {
+ ICODE(vpadd)(result, operand1, operand2);
+ ICODE(vpcmpu)(k1, result, operand1, CmpInt::LessThan);
+ ICODE(vpternlog)(result | k1, result, result, u8(0xFF));
+ } else {
+ ICODE(vpsub)(result, operand1, operand2);
+ ICODE(vpcmpu)(k1, result, operand1, CmpInt::GreaterThan);
+ ICODE(vpxor)(result | k1, result, result);
+ }
+
+ code.ktestb(k1, k1);
+ code.setnz(overflow);
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+
+ const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
+ const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
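+    // There is no packed unsigned compare here, so derive the carry/borrow
+    // arithmetically: for addition, (a & b) + ((a ^ b) >> 1) equals floor((a + b) / 2)
+    // and its top bit is the carry-out; for subtraction, ((a ^ b) >> 1) - (b & (a ^ b))
+    // equals floor((a - b) / 2) and its top bit is the borrow.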
+ if constexpr (op == Op::Add) {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpxor(xmm0, operand1, operand2);
+ code.vpand(tmp, operand1, operand2);
+ ICODE(vpadd)(result, operand1, operand2);
+ } else {
+ code.movaps(tmp, operand1);
+ code.movaps(xmm0, operand1);
+
+ code.pxor(xmm0, operand2);
+ code.pand(tmp, operand2);
+ ICODE(padd)(result, operand2);
+ }
+
+ ICODE(psrl)(xmm0, 1);
+ ICODE(padd)(tmp, xmm0);
+ } else {
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ code.vpxor(tmp, operand1, operand2);
+ ICODE(vpsub)(result, operand1, operand2);
+ code.vpand(xmm0, operand2, tmp);
+ } else {
+ code.movaps(tmp, operand1);
+ code.movaps(xmm0, operand2);
+
+ code.pxor(tmp, operand2);
+ ICODE(psub)(result, operand2);
+ code.pand(xmm0, tmp);
+ }
+
+ ICODE(psrl)(tmp, 1);
+ ICODE(psub)(tmp, xmm0);
+ }
+
+ code.psrad(tmp, 31);
+ if constexpr (esize == 64) {
+ code.pshufd(tmp, tmp, 0b11110101);
+ }
+
+ if (code.HasHostFeature(HostFeature::SSE41)) {
+ code.ptest(tmp, tmp);
+ } else {
+ FCODE(movmskp)(overflow.cvt32(), tmp);
+ code.test(overflow.cvt32(), overflow.cvt32());
+ }
+
+ code.setnz(overflow);
+ code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+ if constexpr (op == Op::Add) {
+ code.por(result, tmp);
+ ctx.reg_alloc.DefineValue(inst, result);
+ } else {
+ code.pandn(tmp, result);
+ ctx.reg_alloc.DefineValue(inst, tmp);
+ }
+}
+
+} // anonymous namespace
+
+void EmitX64::EmitVectorSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturated<Op::Add, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturated<Op::Add, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturated<Op::Sub, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSignedSaturated<Op::Sub, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedSaturated<Op::Add, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedSaturated<Op::Add, 64>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedSaturated<Op::Sub, 32>(code, ctx, inst);
+}
+
+void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
+ EmitVectorUnsignedSaturated<Op::Sub, 64>(code, ctx, inst);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp b/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
new file mode 100644
index 0000000000..a7f964337a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
@@ -0,0 +1,264 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+#include <cstring>
+#include <vector>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/exception_handler.h"
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/common/safe_ops.h"
+
+using UBYTE = u8;
+
+enum UNWIND_REGISTER_CODES {
+ UWRC_RAX,
+ UWRC_RCX,
+ UWRC_RDX,
+ UWRC_RBX,
+ UWRC_RSP,
+ UWRC_RBP,
+ UWRC_RSI,
+ UWRC_RDI,
+ UWRC_R8,
+ UWRC_R9,
+ UWRC_R10,
+ UWRC_R11,
+ UWRC_R12,
+ UWRC_R13,
+ UWRC_R14,
+ UWRC_R15,
+};
+
+enum UNWIND_OPCODE {
+ UWOP_PUSH_NONVOL = 0,
+ UWOP_ALLOC_LARGE = 1,
+ UWOP_ALLOC_SMALL = 2,
+ UWOP_SET_FPREG = 3,
+ UWOP_SAVE_NONVOL = 4,
+ UWOP_SAVE_NONVOL_FAR = 5,
+ UWOP_SAVE_XMM128 = 8,
+ UWOP_SAVE_XMM128_FAR = 9,
+ UWOP_PUSH_MACHFRAME = 10,
+};
+
+union UNWIND_CODE {
+ struct {
+ UBYTE CodeOffset;
+ UBYTE UnwindOp : 4;
+ UBYTE OpInfo : 4;
+ } code;
+ USHORT FrameOffset;
+};
+
+// UNWIND_INFO is a tail-padded structure
+struct UNWIND_INFO {
+ UBYTE Version : 3;
+ UBYTE Flags : 5;
+ UBYTE SizeOfProlog;
+ UBYTE CountOfCodes;
+ UBYTE FrameRegister : 4;
+ UBYTE FrameOffset : 4;
+ // UNWIND_CODE UnwindCode[];
+ // With Flags == 0 there are no additional fields.
+ // OPTIONAL UNW_EXCEPTION_INFO ExceptionInfo;
+};
+
+struct UNW_EXCEPTION_INFO {
+ ULONG ExceptionHandler;
+ // OPTIONAL ARBITRARY HandlerData;
+};
+
+namespace Dynarmic::Backend {
+
+using namespace Dynarmic::Backend::X64;
+
+struct PrologueInformation {
+ std::vector<UNWIND_CODE> unwind_code;
+ size_t number_of_unwind_code_entries;
+ u8 prolog_size;
+};
+
+static PrologueInformation GetPrologueInformation() {
+ PrologueInformation ret;
+
+ const auto next_entry = [&]() -> UNWIND_CODE& {
+ ret.unwind_code.emplace_back();
+ return ret.unwind_code.back();
+ };
+ const auto push_nonvol = [&](u8 offset, UNWIND_REGISTER_CODES reg) {
+ auto& entry = next_entry();
+ entry.code.CodeOffset = offset;
+ entry.code.UnwindOp = UWOP_PUSH_NONVOL;
+ entry.code.OpInfo = reg;
+ };
+ const auto alloc_large = [&](u8 offset, size_t size) {
+ ASSERT(size % 8 == 0);
+ size /= 8;
+
+ auto& entry = next_entry();
+ entry.code.CodeOffset = offset;
+ entry.code.UnwindOp = UWOP_ALLOC_LARGE;
+ if (size <= 0xFFFF) {
+ entry.code.OpInfo = 0;
+ auto& size_entry = next_entry();
+ size_entry.FrameOffset = static_cast<USHORT>(size);
+ } else {
+ entry.code.OpInfo = 1;
+ auto& size_entry_1 = next_entry();
+ size_entry_1.FrameOffset = static_cast<USHORT>(size);
+ auto& size_entry_2 = next_entry();
+ size_entry_2.FrameOffset = static_cast<USHORT>(size >> 16);
+ }
+ };
+ const auto save_xmm128 = [&](u8 offset, u8 reg, size_t frame_offset) {
+ ASSERT(frame_offset % 16 == 0);
+
+ auto& entry = next_entry();
+ entry.code.CodeOffset = offset;
+ entry.code.UnwindOp = UWOP_SAVE_XMM128;
+ entry.code.OpInfo = reg;
+ auto& offset_entry = next_entry();
+ offset_entry.FrameOffset = static_cast<USHORT>(frame_offset / 16);
+ };
+
+ // This is a list of operations that occur in the prologue.
+ // The debugger uses this information to retrieve register values and
+ // to calculate the size of the stack frame.
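+    // Unwind codes must be recorded in descending order of prologue offset
+    // (most recent operation first), hence the XMM saves are listed before the pushes.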
+ ret.prolog_size = 89;
+ save_xmm128(89, 15, 0xB0); // +050 44 0F 29 BC 24 B0 00 00 00 movaps xmmword ptr [rsp+0B0h],xmm15
+ save_xmm128(80, 14, 0xA0); // +047 44 0F 29 B4 24 A0 00 00 00 movaps xmmword ptr [rsp+0A0h],xmm14
+ save_xmm128(71, 13, 0x90); // +03E 44 0F 29 AC 24 90 00 00 00 movaps xmmword ptr [rsp+90h],xmm13
+ save_xmm128(62, 12, 0x80); // +035 44 0F 29 A4 24 80 00 00 00 movaps xmmword ptr [rsp+80h],xmm12
+ save_xmm128(53, 11, 0x70); // +02F 44 0F 29 5C 24 70 movaps xmmword ptr [rsp+70h],xmm11
+ save_xmm128(47, 10, 0x60); // +029 44 0F 29 54 24 60 movaps xmmword ptr [rsp+60h],xmm10
+ save_xmm128(41, 9, 0x50); // +023 44 0F 29 4C 24 50 movaps xmmword ptr [rsp+50h],xmm9
+ save_xmm128(35, 8, 0x40); // +01D 44 0F 29 44 24 40 movaps xmmword ptr [rsp+40h],xmm8
+ save_xmm128(29, 7, 0x30); // +018 0F 29 7C 24 30 movaps xmmword ptr [rsp+30h],xmm7
+ save_xmm128(24, 6, 0x20); // +013 0F 29 74 24 20 movaps xmmword ptr [rsp+20h],xmm6
+ alloc_large(19, 0xC8); // +00C 48 81 EC C8 00 00 00 sub rsp,0C8h
+ push_nonvol(12, UWRC_R15); // +00A 41 57 push r15
+ push_nonvol(10, UWRC_R14); // +008 41 56 push r14
+ push_nonvol(8, UWRC_R13); // +006 41 55 push r13
+ push_nonvol(6, UWRC_R12); // +004 41 54 push r12
+ push_nonvol(4, UWRC_RBP); // +003 55 push rbp
+ push_nonvol(3, UWRC_RDI); // +002 57 push rdi
+ push_nonvol(2, UWRC_RSI); // +001 56 push rsi
+ push_nonvol(1, UWRC_RBX); // +000 53 push rbx
+
+ ret.number_of_unwind_code_entries = ret.unwind_code.size();
+
+ // The Windows API requires the size of the unwind_code array
+ // to be a multiple of two for alignment reasons.
+ if (ret.unwind_code.size() % 2 == 1) {
+ auto& last_entry = next_entry();
+ last_entry.FrameOffset = 0;
+ }
+ ASSERT(ret.unwind_code.size() % 2 == 0);
+
+ return ret;
+}
+
+struct ExceptionHandler::Impl final {
+ Impl(BlockOfCode& code) {
+ const auto prolog_info = GetPrologueInformation();
+
+ code.align(16);
+ const u8* exception_handler_without_cb = code.getCurr<u8*>();
+ code.mov(code.eax, static_cast<u32>(ExceptionContinueSearch));
+ code.ret();
+
+ code.align(16);
+ const u8* exception_handler_with_cb = code.getCurr<u8*>();
+ // Our 3rd argument is a PCONTEXT.
+
+ // If not within our codeblock, ignore this exception.
+ code.mov(code.rax, Safe::Negate(mcl::bit_cast<u64>(code.getCode())));
+ code.add(code.rax, code.qword[code.ABI_PARAM3 + Xbyak::RegExp(offsetof(CONTEXT, Rip))]);
+ code.cmp(code.rax, static_cast<u32>(code.GetTotalCodeSize()));
+ code.ja(exception_handler_without_cb);
+
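+        // Within our code: invoke the user-supplied callback, which returns a FakeCall;
+        // emulate that call by pushing its return RIP onto the faulting context's stack
+        // and pointing Rip at the target before resuming execution.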
+ code.sub(code.rsp, 8);
+ code.mov(code.ABI_PARAM1, mcl::bit_cast<u64>(&cb));
+ code.mov(code.ABI_PARAM2, code.ABI_PARAM3);
+ code.CallLambda(
+ [](const std::function<FakeCall(u64)>& cb_, PCONTEXT ctx) {
+ FakeCall fc = cb_(ctx->Rip);
+
+ ctx->Rsp -= sizeof(u64);
+ *mcl::bit_cast<u64*>(ctx->Rsp) = fc.ret_rip;
+ ctx->Rip = fc.call_rip;
+ });
+ code.add(code.rsp, 8);
+ code.mov(code.eax, static_cast<u32>(ExceptionContinueExecution));
+ code.ret();
+
+ exception_handler_without_cb_offset = static_cast<ULONG>(exception_handler_without_cb - code.getCode<u8*>());
+ exception_handler_with_cb_offset = static_cast<ULONG>(exception_handler_with_cb - code.getCode<u8*>());
+
+ code.align(16);
+ UNWIND_INFO* unwind_info = static_cast<UNWIND_INFO*>(code.AllocateFromCodeSpace(sizeof(UNWIND_INFO)));
+ unwind_info->Version = 1;
+ unwind_info->Flags = UNW_FLAG_EHANDLER;
+ unwind_info->SizeOfProlog = prolog_info.prolog_size;
+ unwind_info->CountOfCodes = static_cast<UBYTE>(prolog_info.number_of_unwind_code_entries);
+ unwind_info->FrameRegister = 0; // No frame register present
+ unwind_info->FrameOffset = 0; // Unused because FrameRegister == 0
+ // UNWIND_INFO::UnwindCode field:
+ const size_t size_of_unwind_code = sizeof(UNWIND_CODE) * prolog_info.unwind_code.size();
+ UNWIND_CODE* unwind_code = static_cast<UNWIND_CODE*>(code.AllocateFromCodeSpace(size_of_unwind_code));
+ memcpy(unwind_code, prolog_info.unwind_code.data(), size_of_unwind_code);
+ // UNWIND_INFO::ExceptionInfo field:
+ except_info = static_cast<UNW_EXCEPTION_INFO*>(code.AllocateFromCodeSpace(sizeof(UNW_EXCEPTION_INFO)));
+ except_info->ExceptionHandler = exception_handler_without_cb_offset;
+
+ code.align(16);
+ rfuncs = static_cast<RUNTIME_FUNCTION*>(code.AllocateFromCodeSpace(sizeof(RUNTIME_FUNCTION)));
+ rfuncs->BeginAddress = static_cast<DWORD>(0);
+ rfuncs->EndAddress = static_cast<DWORD>(code.GetTotalCodeSize());
+ rfuncs->UnwindData = static_cast<DWORD>(reinterpret_cast<u8*>(unwind_info) - code.getCode());
+
+ RtlAddFunctionTable(rfuncs, 1, reinterpret_cast<DWORD64>(code.getCode()));
+ }
+
+ void SetCallback(std::function<FakeCall(u64)> new_cb) {
+ cb = new_cb;
+ except_info->ExceptionHandler = cb ? exception_handler_with_cb_offset : exception_handler_without_cb_offset;
+ }
+
+ ~Impl() {
+ RtlDeleteFunctionTable(rfuncs);
+ }
+
+private:
+ RUNTIME_FUNCTION* rfuncs;
+ std::function<FakeCall(u64)> cb;
+ UNW_EXCEPTION_INFO* except_info;
+ ULONG exception_handler_without_cb_offset;
+ ULONG exception_handler_with_cb_offset;
+};
+
+ExceptionHandler::ExceptionHandler() = default;
+ExceptionHandler::~ExceptionHandler() = default;
+
+void ExceptionHandler::Register(BlockOfCode& code) {
+ impl = std::make_unique<Impl>(code);
+}
+
+bool ExceptionHandler::SupportsFastmem() const noexcept {
+ return static_cast<bool>(impl);
+}
+
+void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) {
+ impl->SetCallback(cb);
+}
+
+} // namespace Dynarmic::Backend
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp
new file mode 100644
index 0000000000..984b67bb02
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp
@@ -0,0 +1,58 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/interface/exclusive_monitor.h"
+
+#include <algorithm>
+
+#include <mcl/assert.hpp>
+
+namespace Dynarmic {
+
+ExclusiveMonitor::ExclusiveMonitor(size_t processor_count)
+ : exclusive_addresses(processor_count, INVALID_EXCLUSIVE_ADDRESS), exclusive_values(processor_count) {}
+
+size_t ExclusiveMonitor::GetProcessorCount() const {
+ return exclusive_addresses.size();
+}
+
+void ExclusiveMonitor::Lock() {
+ lock.Lock();
+}
+
+void ExclusiveMonitor::Unlock() {
+ lock.Unlock();
+}
+
+bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) {
+ const VAddr masked_address = address & RESERVATION_GRANULE_MASK;
+
+ Lock();
+ if (exclusive_addresses[processor_id] != masked_address) {
+ Unlock();
+ return false;
+ }
+
+ for (VAddr& other_address : exclusive_addresses) {
+ if (other_address == masked_address) {
+ other_address = INVALID_EXCLUSIVE_ADDRESS;
+ }
+ }
+ return true;
+}
+
+void ExclusiveMonitor::Clear() {
+ Lock();
+ std::fill(exclusive_addresses.begin(), exclusive_addresses.end(), INVALID_EXCLUSIVE_ADDRESS);
+ Unlock();
+}
+
+void ExclusiveMonitor::ClearProcessor(size_t processor_id) {
+ Lock();
+ exclusive_addresses[processor_id] = INVALID_EXCLUSIVE_ADDRESS;
+ Unlock();
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h
new file mode 100644
index 0000000000..7f7fa24259
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h
@@ -0,0 +1,28 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/interface/exclusive_monitor.h"
+
+namespace Dynarmic {
+
+inline volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor* monitor) {
+ return &monitor->lock.storage;
+}
+
+inline size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor* monitor) {
+ return monitor->exclusive_addresses.size();
+}
+
+inline VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor* monitor, size_t index) {
+ return monitor->exclusive_addresses.data() + index;
+}
+
+inline Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor* monitor, size_t index) {
+ return monitor->exclusive_values.data() + index;
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h b/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h
new file mode 100644
index 0000000000..8e3b14c7bb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h
@@ -0,0 +1,66 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::X64 {
+
+enum class HostFeature : u64 {
+ SSSE3 = 1ULL << 0,
+ SSE41 = 1ULL << 1,
+ SSE42 = 1ULL << 2,
+ AVX = 1ULL << 3,
+ AVX2 = 1ULL << 4,
+ AVX512F = 1ULL << 5,
+ AVX512CD = 1ULL << 6,
+ AVX512VL = 1ULL << 7,
+ AVX512BW = 1ULL << 8,
+ AVX512DQ = 1ULL << 9,
+ AVX512BITALG = 1ULL << 10,
+ AVX512VBMI = 1ULL << 11,
+ PCLMULQDQ = 1ULL << 12,
+ F16C = 1ULL << 13,
+ FMA = 1ULL << 14,
+ AES = 1ULL << 15,
+ SHA = 1ULL << 16,
+ POPCNT = 1ULL << 17,
+ BMI1 = 1ULL << 18,
+ BMI2 = 1ULL << 19,
+ LZCNT = 1ULL << 20,
+ GFNI = 1ULL << 21,
+
+ // Zen-based BMI2
+ FastBMI2 = 1ULL << 22,
+
+    // Orthogonal AVX512 features on 128-bit and 256-bit vectors
+ AVX512_Ortho = AVX512F | AVX512VL,
+
+    // Orthogonal AVX512 features for both 32-bit and 64-bit floats
+ AVX512_OrthoFloat = AVX512_Ortho | AVX512DQ,
+};
+
+constexpr HostFeature operator~(HostFeature f) {
+ return static_cast<HostFeature>(~static_cast<u64>(f));
+}
+
+constexpr HostFeature operator|(HostFeature f1, HostFeature f2) {
+ return static_cast<HostFeature>(static_cast<u64>(f1) | static_cast<u64>(f2));
+}
+
+constexpr HostFeature operator&(HostFeature f1, HostFeature f2) {
+ return static_cast<HostFeature>(static_cast<u64>(f1) & static_cast<u64>(f2));
+}
+
+constexpr HostFeature operator|=(HostFeature& result, HostFeature f) {
+ return result = (result | f);
+}
+
+constexpr HostFeature operator&=(HostFeature& result, HostFeature f) {
+ return result = (result & f);
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp
new file mode 100644
index 0000000000..b5e2a5f18b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp
@@ -0,0 +1,25 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/hostloc.h"
+
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+
+namespace Dynarmic::Backend::X64 {
+
+Xbyak::Reg64 HostLocToReg64(HostLoc loc) {
+ ASSERT(HostLocIsGPR(loc));
+ return Xbyak::Reg64(static_cast<int>(loc));
+}
+
+Xbyak::Xmm HostLocToXmm(HostLoc loc) {
+ ASSERT(HostLocIsXMM(loc));
+ return Xbyak::Xmm(static_cast<int>(loc) - static_cast<int>(HostLoc::XMM0));
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h
new file mode 100644
index 0000000000..dbf526b77a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h
@@ -0,0 +1,147 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+#pragma once
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic::Backend::X64 {
+
+enum class HostLoc {
+    // Ordering of the registers is intentional. See also: HostLocToReg64 and HostLocToXmm.
+ RAX,
+ RCX,
+ RDX,
+ RBX,
+ RSP,
+ RBP,
+ RSI,
+ RDI,
+ R8,
+ R9,
+ R10,
+ R11,
+ R12,
+ R13,
+ R14,
+ R15,
+ XMM0,
+ XMM1,
+ XMM2,
+ XMM3,
+ XMM4,
+ XMM5,
+ XMM6,
+ XMM7,
+ XMM8,
+ XMM9,
+ XMM10,
+ XMM11,
+ XMM12,
+ XMM13,
+ XMM14,
+ XMM15,
+ CF,
+ PF,
+ AF,
+ ZF,
+ SF,
+ OF,
+ FirstSpill,
+};
+
+constexpr size_t NonSpillHostLocCount = static_cast<size_t>(HostLoc::FirstSpill);
+
+inline bool HostLocIsGPR(HostLoc reg) {
+ return reg >= HostLoc::RAX && reg <= HostLoc::R15;
+}
+
+inline bool HostLocIsXMM(HostLoc reg) {
+ return reg >= HostLoc::XMM0 && reg <= HostLoc::XMM15;
+}
+
+inline bool HostLocIsRegister(HostLoc reg) {
+ return HostLocIsGPR(reg) || HostLocIsXMM(reg);
+}
+
+inline bool HostLocIsFlag(HostLoc reg) {
+ return reg >= HostLoc::CF && reg <= HostLoc::OF;
+}
+
+inline HostLoc HostLocRegIdx(int idx) {
+ ASSERT(idx >= 0 && idx <= 15);
+ return static_cast<HostLoc>(idx);
+}
+
+inline HostLoc HostLocXmmIdx(int idx) {
+ ASSERT(idx >= 0 && idx <= 15);
+ return static_cast<HostLoc>(static_cast<size_t>(HostLoc::XMM0) + idx);
+}
+
+inline HostLoc HostLocSpill(size_t i) {
+ return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i);
+}
+
+inline bool HostLocIsSpill(HostLoc reg) {
+ return reg >= HostLoc::FirstSpill;
+}
+
+inline size_t HostLocBitWidth(HostLoc loc) {
+ if (HostLocIsGPR(loc))
+ return 64;
+ if (HostLocIsXMM(loc))
+ return 128;
+ if (HostLocIsSpill(loc))
+ return 128;
+ if (HostLocIsFlag(loc))
+ return 1;
+ UNREACHABLE();
+}
+
+using HostLocList = std::initializer_list<HostLoc>;
+
+// RSP is preserved for function calls
+// R15 contains the JitState pointer
+const HostLocList any_gpr = {
+ HostLoc::RAX,
+ HostLoc::RBX,
+ HostLoc::RCX,
+ HostLoc::RDX,
+ HostLoc::RSI,
+ HostLoc::RDI,
+ HostLoc::RBP,
+ HostLoc::R8,
+ HostLoc::R9,
+ HostLoc::R10,
+ HostLoc::R11,
+ HostLoc::R12,
+ HostLoc::R13,
+ HostLoc::R14,
+};
+
+// XMM0 is reserved for use by instructions that implicitly use it as an argument
+const HostLocList any_xmm = {
+ HostLoc::XMM1,
+ HostLoc::XMM2,
+ HostLoc::XMM3,
+ HostLoc::XMM4,
+ HostLoc::XMM5,
+ HostLoc::XMM6,
+ HostLoc::XMM7,
+ HostLoc::XMM8,
+ HostLoc::XMM9,
+ HostLoc::XMM10,
+ HostLoc::XMM11,
+ HostLoc::XMM12,
+ HostLoc::XMM13,
+ HostLoc::XMM14,
+ HostLoc::XMM15,
+};
+
+Xbyak::Reg64 HostLocToReg64(HostLoc loc);
+Xbyak::Xmm HostLocToXmm(HostLoc loc);
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h b/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h
new file mode 100644
index 0000000000..654b3f0400
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h
@@ -0,0 +1,38 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace Dynarmic::Backend::X64 {
+
+struct JitStateInfo {
+ template<typename JitStateType>
+ JitStateInfo(const JitStateType&)
+ : offsetof_guest_MXCSR(offsetof(JitStateType, guest_MXCSR))
+ , offsetof_asimd_MXCSR(offsetof(JitStateType, asimd_MXCSR))
+ , offsetof_rsb_ptr(offsetof(JitStateType, rsb_ptr))
+ , rsb_ptr_mask(JitStateType::RSBPtrMask)
+ , offsetof_rsb_location_descriptors(offsetof(JitStateType, rsb_location_descriptors))
+ , offsetof_rsb_codeptrs(offsetof(JitStateType, rsb_codeptrs))
+ , offsetof_cpsr_nzcv(offsetof(JitStateType, cpsr_nzcv))
+ , offsetof_fpsr_exc(offsetof(JitStateType, fpsr_exc))
+ , offsetof_fpsr_qc(offsetof(JitStateType, fpsr_qc))
+ , offsetof_halt_reason(offsetof(JitStateType, halt_reason)) {}
+
+ const size_t offsetof_guest_MXCSR;
+ const size_t offsetof_asimd_MXCSR;
+ const size_t offsetof_rsb_ptr;
+ const size_t rsb_ptr_mask;
+ const size_t offsetof_rsb_location_descriptors;
+ const size_t offsetof_rsb_codeptrs;
+ const size_t offsetof_cpsr_nzcv;
+ const size_t offsetof_fpsr_exc;
+ const size_t offsetof_fpsr_qc;
+ const size_t offsetof_halt_reason;
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h b/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h
new file mode 100644
index 0000000000..3a70cf4f0b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h
@@ -0,0 +1,52 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::X64::NZCV {
+
+constexpr u32 arm_mask = 0xF000'0000;
+constexpr u32 x64_mask = 0xC101;
+
+constexpr size_t x64_n_flag_bit = 15;
+constexpr size_t x64_z_flag_bit = 14;
+constexpr size_t x64_c_flag_bit = 8;
+constexpr size_t x64_v_flag_bit = 0;
+
+/// This is a constant used to create the x64 flags format from the ARM format.
+/// NZCV * multiplier: NZCV0NZCV000NZCV
+/// x64_flags format: NZ-----C-------V
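+/// e.g. ToX64(0xF000'0000) = (0xF * 0x1081) & 0xC101 = 0xF78F & 0xC101 = 0xC101,
+/// i.e. N, Z, C and V end up in x64 flag bits 15, 14, 8 and 0 respectively.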
+constexpr u32 to_x64_multiplier = 0x1081;
+
+/// This is a constant used to create the ARM format from the x64 flags format.
+constexpr u32 from_x64_multiplier = 0x1021'0000;
+
+inline u32 ToX64(u32 nzcv) {
+ /* Naive implementation:
+ u32 x64_flags = 0;
+    x64_flags |= mcl::bit::get_bit<31>(nzcv) ? 1 << 15 : 0;
+    x64_flags |= mcl::bit::get_bit<30>(nzcv) ? 1 << 14 : 0;
+    x64_flags |= mcl::bit::get_bit<29>(nzcv) ? 1 << 8 : 0;
+    x64_flags |= mcl::bit::get_bit<28>(nzcv) ? 1 : 0;
+ return x64_flags;
+ */
+ return ((nzcv >> 28) * to_x64_multiplier) & x64_mask;
+}
+
+inline u32 FromX64(u32 x64_flags) {
+ /* Naive implementation:
+ u32 nzcv = 0;
+ nzcv |= mcl::bit::get_bit<15>(x64_flags) ? 1 << 31 : 0;
+ nzcv |= mcl::bit::get_bit<14>(x64_flags) ? 1 << 30 : 0;
+ nzcv |= mcl::bit::get_bit<8>(x64_flags) ? 1 << 29 : 0;
+ nzcv |= mcl::bit::get_bit<0>(x64_flags) ? 1 << 28 : 0;
+ return nzcv;
+ */
+ return ((x64_flags & x64_mask) * from_x64_multiplier) & arm_mask;
+}
+
+} // namespace Dynarmic::Backend::X64::NZCV
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/oparg.h b/externals/dynarmic/src/dynarmic/backend/x64/oparg.h
new file mode 100644
index 0000000000..70c60dfb15
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/oparg.h
@@ -0,0 +1,79 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/assert.hpp>
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic::Backend::X64 {
+
+struct OpArg {
+ OpArg()
+ : type(Type::Operand), inner_operand() {}
+ /* implicit */ OpArg(const Xbyak::Address& address)
+ : type(Type::Address), inner_address(address) {}
+ /* implicit */ OpArg(const Xbyak::Reg& reg)
+ : type(Type::Reg), inner_reg(reg) {}
+
+ Xbyak::Operand& operator*() {
+ switch (type) {
+ case Type::Address:
+ return inner_address;
+ case Type::Operand:
+ return inner_operand;
+ case Type::Reg:
+ return inner_reg;
+ }
+ UNREACHABLE();
+ }
+
+ void setBit(int bits) {
+ switch (type) {
+ case Type::Address:
+ inner_address.setBit(bits);
+ return;
+ case Type::Operand:
+ inner_operand.setBit(bits);
+ return;
+ case Type::Reg:
+ switch (bits) {
+ case 8:
+ inner_reg = inner_reg.cvt8();
+ return;
+ case 16:
+ inner_reg = inner_reg.cvt16();
+ return;
+ case 32:
+ inner_reg = inner_reg.cvt32();
+ return;
+ case 64:
+ inner_reg = inner_reg.cvt64();
+ return;
+ default:
+ ASSERT_MSG(false, "Invalid bits");
+ return;
+ }
+ }
+ UNREACHABLE();
+ }
+
+private:
+ enum class Type {
+ Operand,
+ Address,
+ Reg,
+ };
+
+ Type type;
+
+ union {
+ Xbyak::Operand inner_operand;
+ Xbyak::Address inner_address;
+ Xbyak::Reg inner_reg;
+ };
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp
new file mode 100644
index 0000000000..88691dbffc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp
@@ -0,0 +1,94 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/perf_map.h"
+
+#include <cstddef>
+#include <string>
+
+#ifdef __linux__
+
+# include <cstdio>
+# include <cstdlib>
+# include <mutex>
+
+# include <fmt/format.h>
+# include <mcl/stdint.hpp>
+# include <sys/types.h>
+# include <unistd.h>
+
+namespace Dynarmic::Backend::X64 {
+
+namespace {
+std::mutex mutex;
+std::FILE* file = nullptr;
+
+void OpenFile() {
+ const char* perf_dir = std::getenv("PERF_BUILDID_DIR");
+ if (!perf_dir) {
+ file = nullptr;
+ return;
+ }
+
+ const pid_t pid = getpid();
+ const std::string filename = fmt::format("{:s}/perf-{:d}.map", perf_dir, pid);
+
+ file = std::fopen(filename.c_str(), "w");
+ if (!file) {
+ return;
+ }
+
+ std::setvbuf(file, nullptr, _IONBF, 0);
+}
+} // anonymous namespace
+
+namespace detail {
+void PerfMapRegister(const void* start, const void* end, std::string_view friendly_name) {
+ if (start == end) {
+ // Nothing to register
+ return;
+ }
+
+ std::lock_guard guard{mutex};
+
+ if (!file) {
+ OpenFile();
+ if (!file) {
+ return;
+ }
+ }
+
+ const std::string line = fmt::format("{:016x} {:016x} {:s}\n", reinterpret_cast<u64>(start), reinterpret_cast<u64>(end) - reinterpret_cast<u64>(start), friendly_name);
+ std::fwrite(line.data(), sizeof *line.data(), line.size(), file);
+}
+} // namespace detail
+
+void PerfMapClear() {
+ std::lock_guard guard{mutex};
+
+ if (!file) {
+ return;
+ }
+
+ std::fclose(file);
+ file = nullptr;
+ OpenFile();
+}
+
+} // namespace Dynarmic::Backend::X64
+
+#else
+
+namespace Dynarmic::Backend::X64 {
+
+namespace detail {
+void PerfMapRegister(const void*, const void*, std::string_view) {}
+} // namespace detail
+
+void PerfMapClear() {}
+
+} // namespace Dynarmic::Backend::X64
+
+#endif
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h
new file mode 100644
index 0000000000..7d739ae63e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h
@@ -0,0 +1,25 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string_view>
+
+#include <mcl/bit_cast.hpp>
+
+namespace Dynarmic::Backend::X64 {
+
+namespace detail {
+void PerfMapRegister(const void* start, const void* end, std::string_view friendly_name);
+} // namespace detail
+
+template<typename T>
+void PerfMapRegister(T start, const void* end, std::string_view friendly_name) {
+ detail::PerfMapRegister(mcl::bit_cast<const void*>(start), end, friendly_name);
+}
+
+void PerfMapClear();
+
+} // namespace Dynarmic::Backend::X64
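PerfMapRegister writes one "start size name" line per registered range into $PERF_BUILDID_DIR/perf-<pid>.map, which Linux perf uses to symbolize JIT-compiled code; on non-Linux builds it compiles to a no-op. A hedged usage sketch (the helper and its naming are illustrative, not from the patch):

#include <string_view>

#include <xbyak/xbyak.h>

#include "dynarmic/backend/x64/perf_map.h"

// Illustrative only: record an emitted range so `perf report` can attribute
// samples in it to a readable name. Requires PERF_BUILDID_DIR to be set.
void RegisterEmittedRange(Xbyak::CodeGenerator& code, const void* entry_point, std::string_view name) {
    Dynarmic::Backend::X64::PerfMapRegister(entry_point, code.getCurr(), name);
}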
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
new file mode 100644
index 0000000000..61a3a6cd91
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
@@ -0,0 +1,777 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/reg_alloc.h"
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+
+#include <fmt/ostream.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/stack_layout.h"
+#include "dynarmic/backend/x64/verbose_debugging_output.h"
+
+namespace Dynarmic::Backend::X64 {
+
+#define MAYBE_AVX(OPCODE, ...) \
+ [&] { \
+ if (code.HasHostFeature(HostFeature::AVX)) { \
+ code.v##OPCODE(__VA_ARGS__); \
+ } else { \
+ code.OPCODE(__VA_ARGS__); \
+ } \
+ }()
+
+static bool CanExchange(HostLoc a, HostLoc b) {
+ return HostLocIsGPR(a) && HostLocIsGPR(b);
+}
+
+// Minimum number of bits required to represent a type
+static size_t GetBitWidth(IR::Type type) {
+ switch (type) {
+ case IR::Type::A32Reg:
+ case IR::Type::A32ExtReg:
+ case IR::Type::A64Reg:
+ case IR::Type::A64Vec:
+ case IR::Type::CoprocInfo:
+ case IR::Type::Cond:
+ case IR::Type::Void:
+ case IR::Type::Table:
+ case IR::Type::AccType:
+ ASSERT_FALSE("Type {} cannot be represented at runtime", type);
+ case IR::Type::Opaque:
+ ASSERT_FALSE("Not a concrete type");
+ case IR::Type::U1:
+ return 8;
+ case IR::Type::U8:
+ return 8;
+ case IR::Type::U16:
+ return 16;
+ case IR::Type::U32:
+ return 32;
+ case IR::Type::U64:
+ return 64;
+ case IR::Type::U128:
+ return 128;
+ case IR::Type::NZCVFlags:
+ return 32; // TODO: Update to 16 when flags optimization is done
+ }
+ UNREACHABLE();
+}
+
+static bool IsValuelessType(IR::Type type) {
+ switch (type) {
+ case IR::Type::Table:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool HostLocInfo::IsLocked() const {
+ return is_being_used_count > 0;
+}
+
+bool HostLocInfo::IsEmpty() const {
+ return is_being_used_count == 0 && values.empty();
+}
+
+bool HostLocInfo::IsLastUse() const {
+ return is_being_used_count == 0 && current_references == 1 && accumulated_uses + 1 == total_uses;
+}
+
+void HostLocInfo::SetLastUse() {
+ ASSERT(IsLastUse());
+ is_set_last_use = true;
+}
+
+void HostLocInfo::ReadLock() {
+ ASSERT(!is_scratch);
+ is_being_used_count++;
+}
+
+void HostLocInfo::WriteLock() {
+ ASSERT(is_being_used_count == 0);
+ is_being_used_count++;
+ is_scratch = true;
+}
+
+void HostLocInfo::AddArgReference() {
+ current_references++;
+ ASSERT(accumulated_uses + current_references <= total_uses);
+}
+
+void HostLocInfo::ReleaseOne() {
+ is_being_used_count--;
+ is_scratch = false;
+
+ if (current_references == 0)
+ return;
+
+ accumulated_uses++;
+ current_references--;
+
+ if (current_references == 0)
+ ReleaseAll();
+}
+
+void HostLocInfo::ReleaseAll() {
+ accumulated_uses += current_references;
+ current_references = 0;
+
+ is_set_last_use = false;
+
+ if (total_uses == accumulated_uses) {
+ values.clear();
+ accumulated_uses = 0;
+ total_uses = 0;
+ max_bit_width = 0;
+ }
+
+ is_being_used_count = 0;
+ is_scratch = false;
+}
+
+bool HostLocInfo::ContainsValue(const IR::Inst* inst) const {
+ return std::find(values.begin(), values.end(), inst) != values.end();
+}
+
+size_t HostLocInfo::GetMaxBitWidth() const {
+ return max_bit_width;
+}
+
+void HostLocInfo::AddValue(IR::Inst* inst) {
+ if (is_set_last_use) {
+ is_set_last_use = false;
+ values.clear();
+ }
+ values.push_back(inst);
+ total_uses += inst->UseCount();
+ max_bit_width = std::max(max_bit_width, GetBitWidth(inst->GetType()));
+}
+
+void HostLocInfo::EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const {
+ using namespace Xbyak::util;
+ for (IR::Inst* value : values) {
+ code.mov(code.ABI_PARAM1, rsp);
+ code.mov(code.ABI_PARAM2, host_loc_index);
+ code.mov(code.ABI_PARAM3, value->GetName());
+ code.mov(code.ABI_PARAM4, GetBitWidth(value->GetType()));
+ code.CallFunction(PrintVerboseDebuggingOutputLine);
+ }
+}
+
+IR::Type Argument::GetType() const {
+ return value.GetType();
+}
+
+bool Argument::IsImmediate() const {
+ return value.IsImmediate();
+}
+
+bool Argument::IsVoid() const {
+ return GetType() == IR::Type::Void;
+}
+
+bool Argument::FitsInImmediateU32() const {
+ if (!IsImmediate())
+ return false;
+ const u64 imm = value.GetImmediateAsU64();
+ return imm < 0x100000000;
+}
+
+bool Argument::FitsInImmediateS32() const {
+ if (!IsImmediate())
+ return false;
+ const s64 imm = static_cast<s64>(value.GetImmediateAsU64());
+ return -s64(0x80000000) <= imm && imm <= s64(0x7FFFFFFF);
+}
+
+bool Argument::GetImmediateU1() const {
+ return value.GetU1();
+}
+
+u8 Argument::GetImmediateU8() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x100);
+ return u8(imm);
+}
+
+u16 Argument::GetImmediateU16() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x10000);
+ return u16(imm);
+}
+
+u32 Argument::GetImmediateU32() const {
+ const u64 imm = value.GetImmediateAsU64();
+ ASSERT(imm < 0x100000000);
+ return u32(imm);
+}
+
+u64 Argument::GetImmediateS32() const {
+ ASSERT(FitsInImmediateS32());
+ return value.GetImmediateAsU64();
+}
+
+u64 Argument::GetImmediateU64() const {
+ return value.GetImmediateAsU64();
+}
+
+IR::Cond Argument::GetImmediateCond() const {
+ ASSERT(IsImmediate() && GetType() == IR::Type::Cond);
+ return value.GetCond();
+}
+
+IR::AccType Argument::GetImmediateAccType() const {
+ ASSERT(IsImmediate() && GetType() == IR::Type::AccType);
+ return value.GetAccType();
+}
+
+bool Argument::IsInGpr() const {
+ if (IsImmediate())
+ return false;
+ return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst()));
+}
+
+bool Argument::IsInXmm() const {
+ if (IsImmediate())
+ return false;
+ return HostLocIsXMM(*reg_alloc.ValueLocation(value.GetInst()));
+}
+
+bool Argument::IsInMemory() const {
+ if (IsImmediate())
+ return false;
+ return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst()));
+}
+
+RegAlloc::RegAlloc(BlockOfCode& code, std::vector<HostLoc> gpr_order, std::vector<HostLoc> xmm_order)
+ : gpr_order(gpr_order)
+ , xmm_order(xmm_order)
+ , hostloc_info(NonSpillHostLocCount + SpillCount)
+ , code(code) {}
+
+RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) {
+ ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}};
+ for (size_t i = 0; i < inst->NumArgs(); i++) {
+ const IR::Value arg = inst->GetArg(i);
+ ret[i].value = arg;
+ if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) {
+            ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already be defined");
+ LocInfo(*ValueLocation(arg.GetInst())).AddArgReference();
+ }
+ }
+ return ret;
+}
+
+void RegAlloc::RegisterPseudoOperation(IR::Inst* inst) {
+ ASSERT(IsValueLive(inst) || !inst->HasUses());
+
+ for (size_t i = 0; i < inst->NumArgs(); i++) {
+ const IR::Value arg = inst->GetArg(i);
+ if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) {
+ if (const auto loc = ValueLocation(arg.GetInst())) {
+ // May not necessarily have a value (e.g. CMP variant of Sub32).
+ LocInfo(*loc).AddArgReference();
+ }
+ }
+ }
+}
+
+bool RegAlloc::IsValueLive(IR::Inst* inst) const {
+ return !!ValueLocation(inst);
+}
+
+Xbyak::Reg64 RegAlloc::UseGpr(Argument& arg) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ return HostLocToReg64(UseImpl(arg.value, gpr_order));
+}
+
+Xbyak::Xmm RegAlloc::UseXmm(Argument& arg) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ return HostLocToXmm(UseImpl(arg.value, xmm_order));
+}
+
+OpArg RegAlloc::UseOpArg(Argument& arg) {
+ return UseGpr(arg);
+}
+
+void RegAlloc::Use(Argument& arg, HostLoc host_loc) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ UseImpl(arg.value, {host_loc});
+}
+
+Xbyak::Reg64 RegAlloc::UseScratchGpr(Argument& arg) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ return HostLocToReg64(UseScratchImpl(arg.value, gpr_order));
+}
+
+Xbyak::Xmm RegAlloc::UseScratchXmm(Argument& arg) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ return HostLocToXmm(UseScratchImpl(arg.value, xmm_order));
+}
+
+void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ UseScratchImpl(arg.value, {host_loc});
+}
+
+void RegAlloc::DefineValue(IR::Inst* inst, const Xbyak::Reg& reg) {
+ ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG);
+ const auto hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? HostLoc::XMM0 : HostLoc::RAX));
+ DefineValueImpl(inst, hostloc);
+}
+
+void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) {
+ ASSERT(!arg.allocated);
+ arg.allocated = true;
+ DefineValueImpl(inst, arg.value);
+}
+
+void RegAlloc::Release(const Xbyak::Reg& reg) {
+ ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG);
+ const auto hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? HostLoc::XMM0 : HostLoc::RAX));
+ LocInfo(hostloc).ReleaseOne();
+}
+
+Xbyak::Reg64 RegAlloc::ScratchGpr() {
+ return HostLocToReg64(ScratchImpl(gpr_order));
+}
+
+Xbyak::Reg64 RegAlloc::ScratchGpr(HostLoc desired_location) {
+ return HostLocToReg64(ScratchImpl({desired_location}));
+}
+
+Xbyak::Xmm RegAlloc::ScratchXmm() {
+ return HostLocToXmm(ScratchImpl(xmm_order));
+}
+
+Xbyak::Xmm RegAlloc::ScratchXmm(HostLoc desired_location) {
+ return HostLocToXmm(ScratchImpl({desired_location}));
+}
+
+HostLoc RegAlloc::UseImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations) {
+ if (use_value.IsImmediate()) {
+ return LoadImmediate(use_value, ScratchImpl(desired_locations));
+ }
+
+ const IR::Inst* use_inst = use_value.GetInst();
+ const HostLoc current_location = *ValueLocation(use_inst);
+ const size_t max_bit_width = LocInfo(current_location).GetMaxBitWidth();
+
+ const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
+ if (can_use_current_location) {
+ LocInfo(current_location).ReadLock();
+ return current_location;
+ }
+
+ if (LocInfo(current_location).IsLocked()) {
+ return UseScratchImpl(use_value, desired_locations);
+ }
+
+ const HostLoc destination_location = SelectARegister(desired_locations);
+ if (max_bit_width > HostLocBitWidth(destination_location)) {
+ return UseScratchImpl(use_value, desired_locations);
+ } else if (CanExchange(destination_location, current_location)) {
+ Exchange(destination_location, current_location);
+ } else {
+ MoveOutOfTheWay(destination_location);
+ Move(destination_location, current_location);
+ }
+ LocInfo(destination_location).ReadLock();
+ return destination_location;
+}
+
+HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations) {
+ if (use_value.IsImmediate()) {
+ return LoadImmediate(use_value, ScratchImpl(desired_locations));
+ }
+
+ const IR::Inst* use_inst = use_value.GetInst();
+ const HostLoc current_location = *ValueLocation(use_inst);
+ const size_t bit_width = GetBitWidth(use_inst->GetType());
+
+ const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
+ if (can_use_current_location && !LocInfo(current_location).IsLocked()) {
+ if (!LocInfo(current_location).IsLastUse()) {
+ MoveOutOfTheWay(current_location);
+ } else {
+ LocInfo(current_location).SetLastUse();
+ }
+ LocInfo(current_location).WriteLock();
+ return current_location;
+ }
+
+ const HostLoc destination_location = SelectARegister(desired_locations);
+ MoveOutOfTheWay(destination_location);
+ CopyToScratch(bit_width, destination_location, current_location);
+ LocInfo(destination_location).WriteLock();
+ return destination_location;
+}
+
+HostLoc RegAlloc::ScratchImpl(const std::vector<HostLoc>& desired_locations) {
+ const HostLoc location = SelectARegister(desired_locations);
+ MoveOutOfTheWay(location);
+ LocInfo(location).WriteLock();
+ return location;
+}
+
+void RegAlloc::HostCall(IR::Inst* result_def,
+ std::optional<Argument::copyable_reference> arg0,
+ std::optional<Argument::copyable_reference> arg1,
+ std::optional<Argument::copyable_reference> arg2,
+ std::optional<Argument::copyable_reference> arg3) {
+ constexpr size_t args_count = 4;
+ constexpr std::array<HostLoc, args_count> args_hostloc = {ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4};
+ const std::array<std::optional<Argument::copyable_reference>, args_count> args = {arg0, arg1, arg2, arg3};
+
+ static const std::vector<HostLoc> other_caller_save = [args_hostloc]() {
+ std::vector<HostLoc> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end());
+
+ ret.erase(std::find(ret.begin(), ret.end(), ABI_RETURN));
+ for (auto hostloc : args_hostloc) {
+ ret.erase(std::find(ret.begin(), ret.end(), hostloc));
+ }
+
+ return ret;
+ }();
+
+ ScratchGpr(ABI_RETURN);
+ if (result_def) {
+ DefineValueImpl(result_def, ABI_RETURN);
+ }
+
+ for (size_t i = 0; i < args_count; i++) {
+ if (args[i] && !args[i]->get().IsVoid()) {
+ UseScratch(*args[i], args_hostloc[i]);
+
+            // LLVM puts the burden of zero-extension of 8-bit and 16-bit values on the caller instead of the callee
+ const Xbyak::Reg64 reg = HostLocToReg64(args_hostloc[i]);
+ switch (args[i]->get().GetType()) {
+ case IR::Type::U8:
+ code.movzx(reg.cvt32(), reg.cvt8());
+ break;
+ case IR::Type::U16:
+ code.movzx(reg.cvt32(), reg.cvt16());
+ break;
+ case IR::Type::U32:
+ code.mov(reg.cvt32(), reg.cvt32());
+ break;
+ default:
+ break; // Nothing needs to be done
+ }
+ }
+ }
+
+ for (size_t i = 0; i < args_count; i++) {
+ if (!args[i]) {
+ // TODO: Force spill
+ ScratchGpr(args_hostloc[i]);
+ }
+ }
+
+ for (HostLoc caller_saved : other_caller_save) {
+ ScratchImpl({caller_saved});
+ }
+}
+
+void RegAlloc::AllocStackSpace(size_t stack_space) {
+ ASSERT(stack_space < static_cast<size_t>(std::numeric_limits<s32>::max()));
+ ASSERT(reserved_stack_space == 0);
+ reserved_stack_space = stack_space;
+ code.sub(code.rsp, static_cast<u32>(stack_space));
+}
+
+void RegAlloc::ReleaseStackSpace(size_t stack_space) {
+ ASSERT(stack_space < static_cast<size_t>(std::numeric_limits<s32>::max()));
+ ASSERT(reserved_stack_space == stack_space);
+ reserved_stack_space = 0;
+ code.add(code.rsp, static_cast<u32>(stack_space));
+}
+
+void RegAlloc::EndOfAllocScope() {
+ for (auto& iter : hostloc_info) {
+ iter.ReleaseAll();
+ }
+}
+
+void RegAlloc::AssertNoMoreUses() {
+ ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) { return i.IsEmpty(); }));
+}
+
+void RegAlloc::EmitVerboseDebuggingOutput() {
+ for (size_t i = 0; i < hostloc_info.size(); i++) {
+ hostloc_info[i].EmitVerboseDebuggingOutput(code, i);
+ }
+}
+
+HostLoc RegAlloc::SelectARegister(const std::vector<HostLoc>& desired_locations) const {
+ std::vector<HostLoc> candidates = desired_locations;
+
+    // Find all locations that have not been allocated.
+ const auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc) {
+ return !this->LocInfo(loc).IsLocked();
+ });
+ candidates.erase(allocated_locs, candidates.end());
+ ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated");
+
+ // Selects the best location out of the available locations.
+ // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible.
+
+ std::partition(candidates.begin(), candidates.end(), [this](auto loc) {
+ return this->LocInfo(loc).IsEmpty();
+ });
+
+ return candidates.front();
+}
+
+std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const {
+ for (size_t i = 0; i < hostloc_info.size(); i++) {
+ if (hostloc_info[i].ContainsValue(value)) {
+ return static_cast<HostLoc>(i);
+ }
+ }
+
+ return std::nullopt;
+}
+
+void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) {
+ ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined");
+ LocInfo(host_loc).AddValue(def_inst);
+}
+
+void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) {
+ ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined");
+
+ if (use_inst.IsImmediate()) {
+ const HostLoc location = ScratchImpl(gpr_order);
+ DefineValueImpl(def_inst, location);
+ LoadImmediate(use_inst, location);
+ return;
+ }
+
+ ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined");
+ const HostLoc location = *ValueLocation(use_inst.GetInst());
+ DefineValueImpl(def_inst, location);
+}
+
+HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) {
+ ASSERT_MSG(imm.IsImmediate(), "imm is not an immediate");
+
+ if (HostLocIsGPR(host_loc)) {
+ const Xbyak::Reg64 reg = HostLocToReg64(host_loc);
+ const u64 imm_value = imm.GetImmediateAsU64();
+ if (imm_value == 0) {
+ code.xor_(reg.cvt32(), reg.cvt32());
+ } else {
+ code.mov(reg, imm_value);
+ }
+ return host_loc;
+ }
+
+ if (HostLocIsXMM(host_loc)) {
+ const Xbyak::Xmm reg = HostLocToXmm(host_loc);
+ const u64 imm_value = imm.GetImmediateAsU64();
+ if (imm_value == 0) {
+ MAYBE_AVX(xorps, reg, reg);
+ } else {
+ MAYBE_AVX(movaps, reg, code.Const(code.xword, imm_value));
+ }
+ return host_loc;
+ }
+
+ UNREACHABLE();
+}
+
+void RegAlloc::Move(HostLoc to, HostLoc from) {
+ const size_t bit_width = LocInfo(from).GetMaxBitWidth();
+
+ ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked());
+ ASSERT(bit_width <= HostLocBitWidth(to));
+
+ if (LocInfo(from).IsEmpty()) {
+ return;
+ }
+
+ EmitMove(bit_width, to, from);
+
+ LocInfo(to) = std::exchange(LocInfo(from), {});
+}
+
+void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) {
+ ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty());
+
+ EmitMove(bit_width, to, from);
+}
+
+void RegAlloc::Exchange(HostLoc a, HostLoc b) {
+ ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked());
+ ASSERT(LocInfo(a).GetMaxBitWidth() <= HostLocBitWidth(b));
+ ASSERT(LocInfo(b).GetMaxBitWidth() <= HostLocBitWidth(a));
+
+ if (LocInfo(a).IsEmpty()) {
+ Move(a, b);
+ return;
+ }
+
+ if (LocInfo(b).IsEmpty()) {
+ Move(b, a);
+ return;
+ }
+
+ EmitExchange(a, b);
+
+ std::swap(LocInfo(a), LocInfo(b));
+}
+
+void RegAlloc::MoveOutOfTheWay(HostLoc reg) {
+ ASSERT(!LocInfo(reg).IsLocked());
+ if (!LocInfo(reg).IsEmpty()) {
+ SpillRegister(reg);
+ }
+}
+
+void RegAlloc::SpillRegister(HostLoc loc) {
+ ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled");
+ ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers");
+    ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilled");
+
+ const HostLoc new_loc = FindFreeSpill();
+ Move(new_loc, loc);
+}
+
+HostLoc RegAlloc::FindFreeSpill() const {
+ for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) {
+ const auto loc = static_cast<HostLoc>(i);
+ if (LocInfo(loc).IsEmpty()) {
+ return loc;
+ }
+ }
+
+ ASSERT_FALSE("All spill locations are full");
+}
+
+HostLocInfo& RegAlloc::LocInfo(HostLoc loc) {
+ ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
+ return hostloc_info[static_cast<size_t>(loc)];
+}
+
+const HostLocInfo& RegAlloc::LocInfo(HostLoc loc) const {
+ ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15);
+ return hostloc_info[static_cast<size_t>(loc)];
+}
+
+void RegAlloc::EmitMove(size_t bit_width, HostLoc to, HostLoc from) {
+ if (HostLocIsXMM(to) && HostLocIsXMM(from)) {
+ MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from));
+ } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) {
+ ASSERT(bit_width != 128);
+ if (bit_width == 64) {
+ code.mov(HostLocToReg64(to), HostLocToReg64(from));
+ } else {
+ code.mov(HostLocToReg64(to).cvt32(), HostLocToReg64(from).cvt32());
+ }
+ } else if (HostLocIsXMM(to) && HostLocIsGPR(from)) {
+ ASSERT(bit_width != 128);
+ if (bit_width == 64) {
+ MAYBE_AVX(movq, HostLocToXmm(to), HostLocToReg64(from));
+ } else {
+ MAYBE_AVX(movd, HostLocToXmm(to), HostLocToReg64(from).cvt32());
+ }
+ } else if (HostLocIsGPR(to) && HostLocIsXMM(from)) {
+ ASSERT(bit_width != 128);
+ if (bit_width == 64) {
+ MAYBE_AVX(movq, HostLocToReg64(to), HostLocToXmm(from));
+ } else {
+ MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from));
+ }
+ } else if (HostLocIsXMM(to) && HostLocIsSpill(from)) {
+ const Xbyak::Address spill_addr = SpillToOpArg(from);
+ ASSERT(spill_addr.getBit() >= bit_width);
+ switch (bit_width) {
+ case 128:
+ MAYBE_AVX(movaps, HostLocToXmm(to), spill_addr);
+ break;
+ case 64:
+ MAYBE_AVX(movsd, HostLocToXmm(to), spill_addr);
+ break;
+ case 32:
+ case 16:
+ case 8:
+ MAYBE_AVX(movss, HostLocToXmm(to), spill_addr);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ } else if (HostLocIsSpill(to) && HostLocIsXMM(from)) {
+ const Xbyak::Address spill_addr = SpillToOpArg(to);
+ ASSERT(spill_addr.getBit() >= bit_width);
+ switch (bit_width) {
+ case 128:
+ MAYBE_AVX(movaps, spill_addr, HostLocToXmm(from));
+ break;
+ case 64:
+ MAYBE_AVX(movsd, spill_addr, HostLocToXmm(from));
+ break;
+ case 32:
+ case 16:
+ case 8:
+ MAYBE_AVX(movss, spill_addr, HostLocToXmm(from));
+ break;
+ default:
+ UNREACHABLE();
+ }
+ } else if (HostLocIsGPR(to) && HostLocIsSpill(from)) {
+ ASSERT(bit_width != 128);
+ if (bit_width == 64) {
+ code.mov(HostLocToReg64(to), SpillToOpArg(from));
+ } else {
+ code.mov(HostLocToReg64(to).cvt32(), SpillToOpArg(from));
+ }
+ } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) {
+ ASSERT(bit_width != 128);
+ if (bit_width == 64) {
+ code.mov(SpillToOpArg(to), HostLocToReg64(from));
+ } else {
+ code.mov(SpillToOpArg(to), HostLocToReg64(from).cvt32());
+ }
+ } else {
+ ASSERT_FALSE("Invalid RegAlloc::EmitMove");
+ }
+}
+
+void RegAlloc::EmitExchange(HostLoc a, HostLoc b) {
+ if (HostLocIsGPR(a) && HostLocIsGPR(b)) {
+ code.xchg(HostLocToReg64(a), HostLocToReg64(b));
+ } else if (HostLocIsXMM(a) && HostLocIsXMM(b)) {
+ ASSERT_FALSE("Check your code: Exchanging XMM registers is unnecessary");
+ } else {
+ ASSERT_FALSE("Invalid RegAlloc::EmitExchange");
+ }
+}
+
+Xbyak::Address RegAlloc::SpillToOpArg(HostLoc loc) {
+ ASSERT(HostLocIsSpill(loc));
+
+ size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill);
+ ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations");
+
+ using namespace Xbyak::util;
+ return xword[rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0])];
+}
+
+} // namespace Dynarmic::Backend::X64
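The allocator is driven one IR instruction at a time: an emitter fetches the argument info (which pins the operands), asks for registers in read-only or scratch form, emits code, and finally binds the instruction's result to a host location. A sketch of that call pattern using a made-up emitter for a 32-bit add, assuming BlockOfCode exposes the Xbyak emitter interface as the rest of the backend relies on (the concrete emitters elsewhere in this patch follow the same shape):

#include <xbyak/xbyak.h>

#include "dynarmic/backend/x64/block_of_code.h"
#include "dynarmic/backend/x64/reg_alloc.h"

// Illustrative call pattern only, not one of the emitters added by this patch.
void EmitAdd32Example(Dynarmic::Backend::X64::BlockOfCode& code,
                      Dynarmic::Backend::X64::RegAlloc& reg_alloc,
                      Dynarmic::IR::Inst* inst) {
    auto args = reg_alloc.GetArgumentInfo(inst);                           // pins both operands
    const Xbyak::Reg32 result = reg_alloc.UseScratchGpr(args[0]).cvt32();  // writable copy of operand 0
    const Xbyak::Reg32 rhs = reg_alloc.UseGpr(args[1]).cvt32();            // read-only view of operand 1
    code.add(result, rhs);
    reg_alloc.DefineValue(inst, result);                                   // the result now lives in `result`
}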
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
new file mode 100644
index 0000000000..19debf5f48
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
@@ -0,0 +1,188 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/hostloc.h"
+#include "dynarmic/backend/x64/oparg.h"
+#include "dynarmic/ir/cond.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::IR {
+enum class AccType;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::Backend::X64 {
+
+class RegAlloc;
+
+struct HostLocInfo {
+public:
+ bool IsLocked() const;
+ bool IsEmpty() const;
+ bool IsLastUse() const;
+
+ void SetLastUse();
+
+ void ReadLock();
+ void WriteLock();
+ void AddArgReference();
+ void ReleaseOne();
+ void ReleaseAll();
+
+ bool ContainsValue(const IR::Inst* inst) const;
+ size_t GetMaxBitWidth() const;
+
+ void AddValue(IR::Inst* inst);
+
+ void EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const;
+
+private:
+ // Current instruction state
+ size_t is_being_used_count = 0;
+ bool is_scratch = false;
+ bool is_set_last_use = false;
+
+ // Block state
+ size_t current_references = 0;
+ size_t accumulated_uses = 0;
+ size_t total_uses = 0;
+
+ // Value state
+ std::vector<IR::Inst*> values;
+ size_t max_bit_width = 0;
+};
+
+struct Argument {
+public:
+ using copyable_reference = std::reference_wrapper<Argument>;
+
+ IR::Type GetType() const;
+ bool IsImmediate() const;
+ bool IsVoid() const;
+
+ bool FitsInImmediateU32() const;
+ bool FitsInImmediateS32() const;
+
+ bool GetImmediateU1() const;
+ u8 GetImmediateU8() const;
+ u16 GetImmediateU16() const;
+ u32 GetImmediateU32() const;
+ u64 GetImmediateS32() const;
+ u64 GetImmediateU64() const;
+ IR::Cond GetImmediateCond() const;
+ IR::AccType GetImmediateAccType() const;
+
+ /// Is this value currently in a GPR?
+ bool IsInGpr() const;
+ /// Is this value currently in a XMM?
+ bool IsInXmm() const;
+ /// Is this value currently in memory?
+ bool IsInMemory() const;
+
+private:
+ friend class RegAlloc;
+ explicit Argument(RegAlloc& reg_alloc)
+ : reg_alloc(reg_alloc) {}
+
+ bool allocated = false;
+ RegAlloc& reg_alloc;
+ IR::Value value;
+};
+
+class RegAlloc final {
+public:
+ using ArgumentInfo = std::array<Argument, IR::max_arg_count>;
+
+ explicit RegAlloc(BlockOfCode& code, std::vector<HostLoc> gpr_order, std::vector<HostLoc> xmm_order);
+
+ ArgumentInfo GetArgumentInfo(IR::Inst* inst);
+ void RegisterPseudoOperation(IR::Inst* inst);
+ bool IsValueLive(IR::Inst* inst) const;
+
+ Xbyak::Reg64 UseGpr(Argument& arg);
+ Xbyak::Xmm UseXmm(Argument& arg);
+ OpArg UseOpArg(Argument& arg);
+ void Use(Argument& arg, HostLoc host_loc);
+
+ Xbyak::Reg64 UseScratchGpr(Argument& arg);
+ Xbyak::Xmm UseScratchXmm(Argument& arg);
+ void UseScratch(Argument& arg, HostLoc host_loc);
+
+ void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg);
+ void DefineValue(IR::Inst* inst, Argument& arg);
+
+ void Release(const Xbyak::Reg& reg);
+
+ Xbyak::Reg64 ScratchGpr();
+ Xbyak::Reg64 ScratchGpr(HostLoc desired_location);
+ Xbyak::Xmm ScratchXmm();
+ Xbyak::Xmm ScratchXmm(HostLoc desired_location);
+
+ void HostCall(IR::Inst* result_def = nullptr,
+ std::optional<Argument::copyable_reference> arg0 = {},
+ std::optional<Argument::copyable_reference> arg1 = {},
+ std::optional<Argument::copyable_reference> arg2 = {},
+ std::optional<Argument::copyable_reference> arg3 = {});
+
+ // TODO: Values in host flags
+
+ void AllocStackSpace(size_t stack_space);
+ void ReleaseStackSpace(size_t stack_space);
+
+ void EndOfAllocScope();
+
+ void AssertNoMoreUses();
+
+ void EmitVerboseDebuggingOutput();
+
+private:
+ friend struct Argument;
+
+ std::vector<HostLoc> gpr_order;
+ std::vector<HostLoc> xmm_order;
+
+ HostLoc SelectARegister(const std::vector<HostLoc>& desired_locations) const;
+ std::optional<HostLoc> ValueLocation(const IR::Inst* value) const;
+
+ HostLoc UseImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations);
+ HostLoc UseScratchImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations);
+ HostLoc ScratchImpl(const std::vector<HostLoc>& desired_locations);
+ void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc);
+ void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst);
+
+ HostLoc LoadImmediate(IR::Value imm, HostLoc host_loc);
+ void Move(HostLoc to, HostLoc from);
+ void CopyToScratch(size_t bit_width, HostLoc to, HostLoc from);
+ void Exchange(HostLoc a, HostLoc b);
+ void MoveOutOfTheWay(HostLoc reg);
+
+ void SpillRegister(HostLoc loc);
+ HostLoc FindFreeSpill() const;
+
+ std::vector<HostLocInfo> hostloc_info;
+ HostLocInfo& LocInfo(HostLoc loc);
+ const HostLocInfo& LocInfo(HostLoc loc) const;
+
+ BlockOfCode& code;
+ size_t reserved_stack_space = 0;
+ void EmitMove(size_t bit_width, HostLoc to, HostLoc from);
+ void EmitExchange(HostLoc a, HostLoc b);
+
+ Xbyak::Address SpillToOpArg(HostLoc loc);
+};
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h b/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h
new file mode 100644
index 0000000000..7e6799f1cf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h
@@ -0,0 +1,38 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Backend::X64 {
+
+constexpr size_t SpillCount = 64;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+struct alignas(16) StackLayout {
+ s64 cycles_remaining;
+ s64 cycles_to_run;
+
+ std::array<std::array<u64, 2>, SpillCount> spill;
+
+ u32 save_host_MXCSR;
+
+ bool check_bit;
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+static_assert(sizeof(StackLayout) % 16 == 0);
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp
new file mode 100644
index 0000000000..3378786c46
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp
@@ -0,0 +1,56 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/verbose_debugging_output.h"
+
+#include <iterator>
+
+#include <fmt/format.h>
+
+#include "dynarmic/backend/x64/hostloc.h"
+
+namespace Dynarmic::Backend::X64 {
+
+void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, size_t inst_index, size_t bitsize) {
+ fmt::print("dynarmic debug: %{:05} = ", inst_index);
+
+ Vector value = [&]() -> Vector {
+ if (HostLocIsGPR(hostloc)) {
+ return {reg_data.gprs[HostLocToReg64(hostloc).getIdx()], 0};
+ } else if (HostLocIsXMM(hostloc)) {
+ return reg_data.xmms[HostLocToXmm(hostloc).getIdx()];
+ } else if (HostLocIsSpill(hostloc)) {
+ return (*reg_data.spill)[static_cast<size_t>(hostloc) - static_cast<size_t>(HostLoc::FirstSpill)];
+ } else {
+ fmt::print("invalid hostloc! ");
+ return {0, 0};
+ }
+ }();
+
+ switch (bitsize) {
+ case 8:
+ fmt::print("{:02x}", value[0] & 0xff);
+ break;
+ case 16:
+ fmt::print("{:04x}", value[0] & 0xffff);
+ break;
+ case 32:
+ fmt::print("{:08x}", value[0] & 0xffffffff);
+ break;
+ case 64:
+ fmt::print("{:016x}", value[0]);
+ break;
+ case 128:
+ fmt::print("{:016x}{:016x}", value[1], value[0]);
+ break;
+ default:
+ fmt::print("invalid bitsize!");
+ break;
+ }
+
+ fmt::print("\n");
+}
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h
new file mode 100644
index 0000000000..4eb2646d77
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h
@@ -0,0 +1,37 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/backend/x64/stack_layout.h"
+
+namespace Dynarmic::Backend::X64 {
+
+enum class HostLoc;
+using Vector = std::array<u64, 2>;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4324) // Structure was padded due to alignment specifier
+#endif
+
+struct alignas(16) RegisterData {
+ std::array<u64, 16> gprs;
+ std::array<Vector, 16> xmms;
+ decltype(StackLayout::spill)* spill;
+ u32 mxcsr;
+};
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, size_t inst_index, size_t bitsize);
+
+} // namespace Dynarmic::Backend::X64
diff --git a/externals/dynarmic/src/dynarmic/common/always_false.h b/externals/dynarmic/src/dynarmic/common/always_false.h
new file mode 100644
index 0000000000..b980fea00c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/always_false.h
@@ -0,0 +1,13 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::Common {
+
+template<typename T>
+inline constexpr bool always_false_v = false;
+
+} // namespace Dynarmic::Common
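always_false_v exists because a bare static_assert(false, ...) fires even in branches of if constexpr that are never taken; making the condition depend on the template parameter defers the failure until the offending branch is actually instantiated. A minimal sketch of the intended pattern (the function below is hypothetical):

#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "dynarmic/common/always_false.h"

// Hypothetical example: reject unsupported types at compile time, but only
// when the unsupported branch is actually instantiated.
template<typename T>
constexpr std::size_t BitWidthOf() {
    if constexpr (std::is_same_v<T, std::uint32_t>) {
        return 32;
    } else if constexpr (std::is_same_v<T, std::uint64_t>) {
        return 64;
    } else {
        static_assert(Dynarmic::Common::always_false_v<T>, "unsupported type");
        return 0;  // never reached; keeps missing-return warnings quiet
    }
}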
diff --git a/externals/dynarmic/src/dynarmic/common/atomic.h b/externals/dynarmic/src/dynarmic/common/atomic.h
new file mode 100644
index 0000000000..34042d3802
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/atomic.h
@@ -0,0 +1,44 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Atomic {
+
+inline u32 Load(volatile u32* ptr) {
+#ifdef _MSC_VER
+ return _InterlockedOr(reinterpret_cast<volatile long*>(ptr), 0);
+#else
+ return __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
+#endif
+}
+
+inline void Or(volatile u32* ptr, u32 value) {
+#ifdef _MSC_VER
+ _InterlockedOr(reinterpret_cast<volatile long*>(ptr), value);
+#else
+ __atomic_or_fetch(ptr, value, __ATOMIC_SEQ_CST);
+#endif
+}
+
+inline void And(volatile u32* ptr, u32 value) {
+#ifdef _MSC_VER
+ _InterlockedAnd(reinterpret_cast<volatile long*>(ptr), value);
+#else
+ __atomic_and_fetch(ptr, value, __ATOMIC_SEQ_CST);
+#endif
+}
+
+inline void Barrier() {
+#ifdef _MSC_VER
+ _ReadWriteBarrier();
+#else
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#endif
+}
+
+} // namespace Dynarmic::Atomic
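These helpers provide sequentially consistent load, OR and AND over a raw volatile u32 without requiring the field to be declared std::atomic, papering over the MSVC/GCC intrinsic split. A small usage sketch (the flag word and bit meaning are made up for illustration):

#include <mcl/stdint.hpp>

#include "dynarmic/common/atomic.h"

// Illustrative only: one thread sets a request bit, another observes and clears it.
volatile u32 g_request_flags = 0;

void RequestStop() {
    Dynarmic::Atomic::Or(&g_request_flags, 1u);  // set bit 0
    Dynarmic::Atomic::Barrier();                 // order it against later plain accesses
}

bool StopRequested() {
    return (Dynarmic::Atomic::Load(&g_request_flags) & 1u) != 0;
}

void ClearStopRequest() {
    Dynarmic::Atomic::And(&g_request_flags, ~1u);  // clear bit 0
}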
diff --git a/externals/dynarmic/src/dynarmic/common/cast_util.h b/externals/dynarmic/src/dynarmic/common/cast_util.h
new file mode 100644
index 0000000000..92c9a259b3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/cast_util.h
@@ -0,0 +1,18 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/type_traits/function_info.hpp>
+
+namespace Dynarmic::Common {
+
+/// Cast a lambda into an equivalent function pointer.
+template<class Function>
+inline auto FptrCast(Function f) noexcept {
+ return static_cast<mcl::equivalent_function_type<Function>*>(f);
+}
+
+} // namespace Dynarmic::Common
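FptrCast converts a capture-less lambda into a plain function pointer with the signature deduced by mcl::equivalent_function_type, which is the form a call target must take before its address can be baked into emitted code. A minimal sketch (the callback is illustrative):

#include <cstdint>

#include "dynarmic/common/cast_util.h"

// Illustrative only: the capture-less lambda becomes an ordinary function
// pointer that could be handed to a code emitter as a call target.
using AddOneFn = std::uint64_t (*)(std::uint64_t);

AddOneFn MakeAddOneCallback() {
    return Dynarmic::Common::FptrCast([](std::uint64_t x) { return x + 1; });
}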
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp b/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp
new file mode 100644
index 0000000000..c431758e57
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp
@@ -0,0 +1,180 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/crypto/aes.h"
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::AES {
+
+using SubstitutionTable = std::array<u8, 256>;
+
+// See section 5.1.1 Figure 7 in FIPS 197
+constexpr SubstitutionTable substitution_box{
+ {// 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+ 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+ 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+ 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+ 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+ 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+ 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+ 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+ 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+ 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+ 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+ 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+ 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+ 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+ 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+ 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16}};
+
+// See section 5.3.2 Figure 14 in FIPS 197
+constexpr SubstitutionTable inverse_substitution_box{
+ {// 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
+ 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
+ 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
+ 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
+ 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
+ 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
+ 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
+ 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
+ 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
+ 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
+ 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
+ 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
+ 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
+ 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
+ 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D}};
+
+// See section 4.2.1 in FIPS 197.
+static constexpr u8 xtime(u8 x) {
+ return static_cast<u8>((x << 1) ^ (((x >> 7) & 1) * 0x1B));
+}
+
+// Galois Field multiplication.
+static constexpr u8 Multiply(u8 x, u8 y) {
+ return static_cast<u8>(((y & 1) * x)
+ ^ ((y >> 1 & 1) * xtime(x))
+ ^ ((y >> 2 & 1) * xtime(xtime(x)))
+ ^ ((y >> 3 & 1) * xtime(xtime(xtime(x))))
+ ^ ((y >> 4 & 1) * xtime(xtime(xtime(xtime(x))))));
+}
+
+static void ShiftRows(State& out_state, const State& state) {
+ // Move zeroth row over
+ out_state[0] = state[0];
+ out_state[4] = state[4];
+ out_state[8] = state[8];
+ out_state[12] = state[12];
+
+ // Rotate first row 1 column left.
+ u8 temp = state[1];
+ out_state[1] = state[5];
+ out_state[5] = state[9];
+ out_state[9] = state[13];
+ out_state[13] = temp;
+
+ // Rotate second row 2 columns left
+ temp = state[2];
+ out_state[2] = state[10];
+ out_state[10] = temp;
+
+ temp = state[6];
+ out_state[6] = state[14];
+ out_state[14] = temp;
+
+ // Rotate third row 3 columns left
+ temp = state[3];
+ out_state[3] = state[15];
+ out_state[15] = state[11];
+ out_state[11] = state[7];
+ out_state[7] = temp;
+}
+
+static void InverseShiftRows(State& out_state, const State& state) {
+ // Move zeroth row over
+ out_state[0] = state[0];
+ out_state[4] = state[4];
+ out_state[8] = state[8];
+ out_state[12] = state[12];
+
+ // Rotate first row 1 column right.
+ u8 temp = state[13];
+ out_state[13] = state[9];
+ out_state[9] = state[5];
+ out_state[5] = state[1];
+ out_state[1] = temp;
+
+ // Rotate second row 2 columns right
+ temp = state[2];
+ out_state[2] = state[10];
+ out_state[10] = temp;
+
+ temp = state[6];
+ out_state[6] = state[14];
+ out_state[14] = temp;
+
+ // Rotate third row 3 columns right
+ temp = state[3];
+ out_state[3] = state[7];
+ out_state[7] = state[11];
+ out_state[11] = state[15];
+ out_state[15] = temp;
+}
+
+static void SubBytes(State& state, const SubstitutionTable& table) {
+ for (size_t i = 0; i < 4; i++) {
+ for (size_t j = 0; j < 4; j++) {
+ state[4 * i + j] = table[state[4 * i + j]];
+ }
+ }
+}
+
+void DecryptSingleRound(State& out_state, const State& state) {
+ InverseShiftRows(out_state, state);
+ SubBytes(out_state, inverse_substitution_box);
+}
+
+void EncryptSingleRound(State& out_state, const State& state) {
+ ShiftRows(out_state, state);
+ SubBytes(out_state, substitution_box);
+}
+
+void MixColumns(State& out_state, const State& state) {
+ for (size_t i = 0; i < out_state.size(); i += 4) {
+ const u8 a = state[i];
+ const u8 b = state[i + 1];
+ const u8 c = state[i + 2];
+ const u8 d = state[i + 3];
+
+ const u8 tmp = a ^ b ^ c ^ d;
+
+ out_state[i + 0] = a ^ xtime(a ^ b) ^ tmp;
+ out_state[i + 1] = b ^ xtime(b ^ c) ^ tmp;
+ out_state[i + 2] = c ^ xtime(c ^ d) ^ tmp;
+ out_state[i + 3] = d ^ xtime(d ^ a) ^ tmp;
+ }
+}
+
+void InverseMixColumns(State& out_state, const State& state) {
+ for (size_t i = 0; i < out_state.size(); i += 4) {
+ const u8 a = state[i];
+ const u8 b = state[i + 1];
+ const u8 c = state[i + 2];
+ const u8 d = state[i + 3];
+
+ out_state[i + 0] = Multiply(a, 0x0E) ^ Multiply(b, 0x0B) ^ Multiply(c, 0x0D) ^ Multiply(d, 0x09);
+ out_state[i + 1] = Multiply(a, 0x09) ^ Multiply(b, 0x0E) ^ Multiply(c, 0x0B) ^ Multiply(d, 0x0D);
+ out_state[i + 2] = Multiply(a, 0x0D) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0E) ^ Multiply(d, 0x0B);
+ out_state[i + 3] = Multiply(a, 0x0B) ^ Multiply(b, 0x0D) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0E);
+ }
+}
+
+} // namespace Dynarmic::Common::Crypto::AES
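xtime is multiplication by {02} in GF(2^8) reduced by the AES polynomial x^8 + x^4 + x^3 + x + 1, and Multiply composes shifted copies of it, so {57} * {13} = {FE}, the worked example from FIPS 197 section 4.2. Since the helpers above have internal linkage, the sketch below restates them locally just to check that identity at compile time (editorial, not part of the patch):

#include <cstdint>

// Editorial sketch: local restatement of xtime/Multiply used only to verify
// the FIPS 197 section 4.2 worked example {57} * {13} = {FE}.
constexpr std::uint8_t XTimeSketch(std::uint8_t x) {
    return static_cast<std::uint8_t>((x << 1) ^ (((x >> 7) & 1) * 0x1B));
}

constexpr std::uint8_t GFMultiplySketch(std::uint8_t x, std::uint8_t y) {
    std::uint8_t result = 0;
    while (y != 0) {
        if (y & 1) {
            result ^= x;       // accumulate (XOR is addition in GF(2^8))
        }
        x = XTimeSketch(x);    // next power-of-two multiple of x
        y >>= 1;
    }
    return result;
}

static_assert(GFMultiplySketch(0x57, 0x13) == 0xFE, "FIPS 197 worked example");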
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/aes.h b/externals/dynarmic/src/dynarmic/common/crypto/aes.h
new file mode 100644
index 0000000000..fa6d5a81b3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/aes.h
@@ -0,0 +1,23 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::AES {
+
+using State = std::array<u8, 16>;
+
+// Assumes the state has already been XORed by the round key.
+void DecryptSingleRound(State& out_state, const State& state);
+void EncryptSingleRound(State& out_state, const State& state);
+
+void MixColumns(State& out_state, const State& state);
+void InverseMixColumns(State& out_state, const State& state);
+
+} // namespace Dynarmic::Common::Crypto::AES
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp b/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp
new file mode 100644
index 0000000000..c00385078b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp
@@ -0,0 +1,168 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/crypto/crc32.h"
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::CRC32 {
+
+using CRC32Table = std::array<u32, 256>;
+
+// CRC32 algorithm that uses polynomial 0x1EDC6F41
+constexpr CRC32Table castagnoli_table{
+ {0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351}};
+
+// CRC32 algorithm that uses polynomial 0x04C11DB7
+constexpr CRC32Table iso_table{
+ {0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
+ 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+ 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+ 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+ 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
+ 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+ 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
+ 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+ 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+ 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+ 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
+ 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+ 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
+ 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+ 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+ 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+ 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
+ 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+ 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
+ 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+ 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+ 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+ 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
+ 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+ 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
+ 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+ 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+ 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+ 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
+ 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+ 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
+ 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+ 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+ 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+ 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
+ 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+ 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
+ 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+ 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+ 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+ 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
+ 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+ 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
+ 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+ 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+ 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+ 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
+ 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+ 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
+ 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+ 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+ 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+ 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
+ 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+ 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
+ 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+ 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+ 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+ 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
+ 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+ 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
+ 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+ 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+ 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D}};
+
+static u32 ComputeCRC32(const CRC32Table& table, u32 crc, const u64 value, int length) {
+ const auto* data = reinterpret_cast<const unsigned char*>(&value);
+
+ while (length-- > 0) {
+ crc = (crc >> 8) ^ table[(crc ^ (*data++)) & 0xFF];
+ }
+
+ return crc;
+}
+
+u32 ComputeCRC32Castagnoli(u32 crc, u64 value, int length) {
+ return ComputeCRC32(castagnoli_table, crc, value, length);
+}
+
+u32 ComputeCRC32ISO(u32 crc, u64 value, int length) {
+ return ComputeCRC32(iso_table, crc, value, length);
+}
+
+} // namespace Dynarmic::Common::Crypto::CRC32
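Both tables store the reflected (LSB-first) form of their polynomials, so an entry can be regenerated by eight shift-right steps that conditionally XOR in the bit-reversed polynomial: 0x82F63B78 for Castagnoli (0x1EDC6F41) and 0xEDB88320 for ISO (0x04C11DB7). A compile-time cross-check of one entry from each table (editorial sketch, not part of the patch):

#include <cstdint>

// Editorial sketch: regenerate a reflected CRC-32 table entry and compare it
// against the constants above.
constexpr std::uint32_t ReflectedTableEntry(std::uint32_t index, std::uint32_t reversed_polynomial) {
    std::uint32_t crc = index;
    for (int bit = 0; bit < 8; ++bit) {
        crc = (crc & 1) ? ((crc >> 1) ^ reversed_polynomial) : (crc >> 1);
    }
    return crc;
}

static_assert(ReflectedTableEntry(1, 0x82F63B78) == 0xF26B8303, "castagnoli_table[1]");
static_assert(ReflectedTableEntry(1, 0xEDB88320) == 0x77073096, "iso_table[1]");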
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/crc32.h b/externals/dynarmic/src/dynarmic/common/crypto/crc32.h
new file mode 100644
index 0000000000..30942327d4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/crc32.h
@@ -0,0 +1,40 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::CRC32 {
+
+/**
+ * Computes a CRC32 value using Castagnoli polynomial (0x1EDC6F41).
+ *
+ * @param crc The initial CRC value.
+ * @param value The value to compute the CRC of.
+ * @param length The number of bytes of @p value to process.
+ *
+ * @remark @p value is interpreted internally as an array of
+ * unsigned char containing @p length bytes.
+ *
+ * @return The computed CRC32 value.
+ */
+u32 ComputeCRC32Castagnoli(u32 crc, u64 value, int length);
+
+/**
+ * Computes a CRC32 value using the ISO polynomial (0x04C11DB7).
+ *
+ * @param crc The initial CRC value.
+ * @param value The value to compute the CRC of.
+ * @param length The number of bytes of @p value to process.
+ *
+ * @remark @p value is interpreted internally as an array of
+ * unsigned char containing @p length bytes.
+ *
+ * @return The computed CRC32 value.
+ */
+u32 ComputeCRC32ISO(u32 crc, u64 value, int length);
+
+} // namespace Dynarmic::Common::Crypto::CRC32
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp b/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp
new file mode 100644
index 0000000000..5743e5be9a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp
@@ -0,0 +1,54 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/crypto/sm4.h"
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::SM4 {
+
+using SubstitutionTable = std::array<u8, 256>;
+
+constexpr SubstitutionTable substitution_box{
+ {0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7,
+ 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05,
+ 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3,
+ 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+ 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A,
+ 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62,
+ 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95,
+ 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6,
+ 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA,
+ 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
+ 0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B,
+ 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35,
+ 0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2,
+ 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
+ 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52,
+ 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E,
+ 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5,
+ 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1,
+ 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55,
+ 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
+ 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60,
+ 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F,
+ 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F,
+ 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51,
+ 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F,
+ 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8,
+ 0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD,
+ 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
+ 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E,
+ 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
+ 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20,
+ 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48}};
+
+u8 AccessSubstitutionBox(u8 index) {
+ return substitution_box[index];
+}
+
+} // namespace Dynarmic::Common::Crypto::SM4
diff --git a/externals/dynarmic/src/dynarmic/common/crypto/sm4.h b/externals/dynarmic/src/dynarmic/common/crypto/sm4.h
new file mode 100644
index 0000000000..417e9b9263
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/crypto/sm4.h
@@ -0,0 +1,14 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common::Crypto::SM4 {
+
+u8 AccessSubstitutionBox(u8 index);
+
+} // namespace Dynarmic::Common::Crypto::SM4
diff --git a/externals/dynarmic/src/dynarmic/common/fp/fpcr.h b/externals/dynarmic/src/dynarmic/common/fp/fpcr.h
new file mode 100644
index 0000000000..287f6d2889
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/fpcr.h
@@ -0,0 +1,209 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/rounding_mode.h"
+
+namespace Dynarmic::FP {
+
+/**
+ * Representation of the Floating-Point Control Register.
+ */
+class FPCR final {
+public:
+ FPCR() = default;
+ FPCR(const FPCR&) = default;
+ FPCR(FPCR&&) = default;
+ explicit FPCR(u32 data)
+ : value{data & mask} {}
+
+ FPCR& operator=(const FPCR&) = default;
+ FPCR& operator=(FPCR&&) = default;
+ FPCR& operator=(u32 data) {
+ value = data & mask;
+ return *this;
+ }
+
+ /// Get alternate half-precision control flag.
+ bool AHP() const {
+ return mcl::bit::get_bit<26>(value);
+ }
+
+ /// Set alternate half-precision control flag.
+ void AHP(bool ahp) {
+ value = mcl::bit::set_bit<26>(value, ahp);
+ }
+
+ /// Get default NaN mode control bit.
+ bool DN() const {
+ return mcl::bit::get_bit<25>(value);
+ }
+
+ /// Set default NaN mode control bit.
+ void DN(bool dn) {
+ value = mcl::bit::set_bit<25>(value, dn);
+ }
+
+ /// Get flush-to-zero mode control bit.
+ bool FZ() const {
+ return mcl::bit::get_bit<24>(value);
+ }
+
+ /// Set flush-to-zero mode control bit.
+ void FZ(bool fz) {
+ value = mcl::bit::set_bit<24>(value, fz);
+ }
+
+ /// Get rounding mode control field.
+ FP::RoundingMode RMode() const {
+ return static_cast<FP::RoundingMode>(mcl::bit::get_bits<22, 23>(value));
+ }
+
+ /// Set rounding mode control field.
+ void RMode(FP::RoundingMode rounding_mode) {
+ ASSERT_MSG(static_cast<u32>(rounding_mode) <= 0b11, "FPCR: Invalid rounding mode");
+ value = mcl::bit::set_bits<22, 23>(value, static_cast<u32>(rounding_mode));
+ }
+
+ /// Get the stride of a vector when executing AArch32 VFP instructions.
+ /// This field has no function in AArch64 state.
+ std::optional<size_t> Stride() const {
+ switch (mcl::bit::get_bits<20, 21>(value)) {
+ case 0b00:
+ return 1;
+ case 0b11:
+ return 2;
+ default:
+ return std::nullopt;
+ }
+ }
+
+ /// Set the stride of a vector when executing AArch32 VFP instructions.
+ /// This field has no function in AArch64 state.
+ void Stride(size_t stride) {
+ ASSERT_MSG(stride >= 1 && stride <= 2, "FPCR: Invalid stride");
+ value = mcl::bit::set_bits<20, 21>(value, stride == 1 ? 0b00u : 0b11u);
+ }
+
+ /// Get flush-to-zero (half-precision specific) mode control bit.
+ bool FZ16() const {
+ return mcl::bit::get_bit<19>(value);
+ }
+
+ /// Set flush-to-zero (half-precision specific) mode control bit.
+ void FZ16(bool fz16) {
+ value = mcl::bit::set_bit<19>(value, fz16);
+ }
+
+ /// Gets the length of a vector when executing AArch32 VFP instructions.
+ /// This field has no function in AArch64 state.
+ size_t Len() const {
+ return mcl::bit::get_bits<16, 18>(value) + 1;
+ }
+
+ /// Sets the length of a vector when executing AArch32 VFP instructions.
+ /// This field has no function in AArch64 state.
+ void Len(size_t len) {
+ ASSERT_MSG(len >= 1 && len <= 8, "FPCR: Invalid len");
+ value = mcl::bit::set_bits<16, 18>(value, static_cast<u32>(len - 1));
+ }
+
+ /// Get input denormal exception trap enable flag.
+ bool IDE() const {
+ return mcl::bit::get_bit<15>(value);
+ }
+
+ /// Set input denormal exception trap enable flag.
+ void IDE(bool ide) {
+ value = mcl::bit::set_bit<15>(value, ide);
+ }
+
+ /// Get inexact exception trap enable flag.
+ bool IXE() const {
+ return mcl::bit::get_bit<12>(value);
+ }
+
+ /// Set inexact exception trap enable flag.
+ void IXE(bool ixe) {
+ value = mcl::bit::set_bit<12>(value, ixe);
+ }
+
+ /// Get underflow exception trap enable flag.
+ bool UFE() const {
+ return mcl::bit::get_bit<11>(value);
+ }
+
+ /// Set underflow exception trap enable flag.
+ void UFE(bool ufe) {
+ value = mcl::bit::set_bit<11>(value, ufe);
+ }
+
+ /// Get overflow exception trap enable flag.
+ bool OFE() const {
+ return mcl::bit::get_bit<10>(value);
+ }
+
+ /// Set overflow exception trap enable flag.
+ void OFE(bool ofe) {
+ value = mcl::bit::set_bit<10>(value, ofe);
+ }
+
+ /// Get division by zero exception trap enable flag.
+ bool DZE() const {
+ return mcl::bit::get_bit<9>(value);
+ }
+
+ /// Set division by zero exception trap enable flag.
+ void DZE(bool dze) {
+ value = mcl::bit::set_bit<9>(value, dze);
+ }
+
+ /// Get invalid operation exception trap enable flag.
+ bool IOE() const {
+ return mcl::bit::get_bit<8>(value);
+ }
+
+ /// Set invalid operation exception trap enable flag.
+ void IOE(bool ioe) {
+ value = mcl::bit::set_bit<8>(value, ioe);
+ }
+
+ /// Gets the underlying raw value within the FPCR.
+ u32 Value() const {
+ return value;
+ }
+
+ /// Gets the StandardFPSCRValue (A32 ASIMD).
+ FPCR ASIMDStandardValue() const {
+ FPCR stdvalue;
+ stdvalue.AHP(AHP());
+ stdvalue.FZ16(FZ16());
+ stdvalue.FZ(true);
+ stdvalue.DN(true);
+ return stdvalue;
+ }
+
+private:
+ // Bits 0-7, 13-14, and 27-31 are reserved.
+ static constexpr u32 mask = 0x07FF9F00;
+ u32 value = 0;
+};
+
+inline bool operator==(FPCR lhs, FPCR rhs) {
+ return lhs.Value() == rhs.Value();
+}
+
+inline bool operator!=(FPCR lhs, FPCR rhs) {
+ return !operator==(lhs, rhs);
+}
+
+} // namespace Dynarmic::FP
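
The accessors above are thin wrappers over individual FPCR bit fields. A short sketch of how they compose, with invented settings (not part of the patch):

#include "dynarmic/common/fp/fpcr.h"

using Dynarmic::FP::FPCR;

u32 FpcrExample() {
    FPCR fpcr;                                            // all fields clear
    fpcr.RMode(Dynarmic::FP::RoundingMode::TowardsZero);  // rounding mode, bits 22-23
    fpcr.FZ(true);                                        // flush-to-zero, bit 24
    fpcr.DN(true);                                        // default NaN, bit 25
    const FPCR standard = fpcr.ASIMDStandardValue();      // AHP/FZ16 copied, FZ and DN forced on
    (void)standard;
    return fpcr.Value();                                  // raw register image with only those fields set
}
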
diff --git a/externals/dynarmic/src/dynarmic/common/fp/fpsr.h b/externals/dynarmic/src/dynarmic/common/fp/fpsr.h
new file mode 100644
index 0000000000..fba58ebbb7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/fpsr.h
@@ -0,0 +1,160 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::FP {
+
+/**
+ * Representation of the Floating-Point Status Register.
+ */
+class FPSR final {
+public:
+ FPSR() = default;
+ FPSR(const FPSR&) = default;
+ FPSR(FPSR&&) = default;
+ explicit FPSR(u32 data)
+ : value{data & mask} {}
+
+ FPSR& operator=(const FPSR&) = default;
+ FPSR& operator=(FPSR&&) = default;
+ FPSR& operator=(u32 data) {
+ value = data & mask;
+ return *this;
+ }
+
+ /// Get negative condition flag
+ bool N() const {
+ return mcl::bit::get_bit<31>(value);
+ }
+
+ /// Set negative condition flag
+ void N(bool N_) {
+ value = mcl::bit::set_bit<31>(value, N_);
+ }
+
+ /// Get zero condition flag
+ bool Z() const {
+ return mcl::bit::get_bit<30>(value);
+ }
+
+ /// Set zero condition flag
+ void Z(bool Z_) {
+ value = mcl::bit::set_bit<30>(value, Z_);
+ }
+
+ /// Get carry condition flag
+ bool C() const {
+ return mcl::bit::get_bit<29>(value);
+ }
+
+ /// Set carry condition flag
+ void C(bool C_) {
+ value = mcl::bit::set_bit<29>(value, C_);
+ }
+
+ /// Get overflow condition flag
+ bool V() const {
+ return mcl::bit::get_bit<28>(value);
+ }
+
+ /// Set overflow condition flag
+ void V(bool V_) {
+ value = mcl::bit::set_bit<28>(value, V_);
+ }
+
+ /// Get cumulative saturation bit
+ bool QC() const {
+ return mcl::bit::get_bit<27>(value);
+ }
+
+ /// Set cumulative saturation bit
+ void QC(bool QC_) {
+ value = mcl::bit::set_bit<27>(value, QC_);
+ }
+
+ /// Get input denormal floating-point exception bit
+ bool IDC() const {
+ return mcl::bit::get_bit<7>(value);
+ }
+
+ /// Set input denormal floating-point exception bit
+ void IDC(bool IDC_) {
+ value = mcl::bit::set_bit<7>(value, IDC_);
+ }
+
+ /// Get inexact cumulative floating-point exception bit
+ bool IXC() const {
+ return mcl::bit::get_bit<4>(value);
+ }
+
+ /// Set inexact cumulative floating-point exception bit
+ void IXC(bool IXC_) {
+ value = mcl::bit::set_bit<4>(value, IXC_);
+ }
+
+ /// Get underflow cumulative floating-point exception bit
+ bool UFC() const {
+ return mcl::bit::get_bit<3>(value);
+ }
+
+ /// Set underflow cumulative floating-point exception bit
+ void UFC(bool UFC_) {
+ value = mcl::bit::set_bit<3>(value, UFC_);
+ }
+
+ /// Get overflow cumulative floating-point exception bit
+ bool OFC() const {
+ return mcl::bit::get_bit<2>(value);
+ }
+
+ /// Set overflow cumulative floating-point exception bit
+ void OFC(bool OFC_) {
+ value = mcl::bit::set_bit<2>(value, OFC_);
+ }
+
+ /// Get divide by zero cumulative floating-point exception bit
+ bool DZC() const {
+ return mcl::bit::get_bit<1>(value);
+ }
+
+ /// Set divide by zero cumulative floating-point exception bit
+ void DZC(bool DZC_) {
+ value = mcl::bit::set_bit<1>(value, DZC_);
+ }
+
+ /// Get invalid operation cumulative floating-point exception bit
+ bool IOC() const {
+ return mcl::bit::get_bit<0>(value);
+ }
+
+ /// Set invalid operation cumulative floating-point exception bit
+ void IOC(bool IOC_) {
+ value = mcl::bit::set_bit<0>(value, IOC_);
+ }
+
+ /// Gets the underlying raw value within the FPSR.
+ u32 Value() const {
+ return value;
+ }
+
+private:
+ // Bits 5-6 and 8-26 are reserved.
+ static constexpr u32 mask = 0xF800009F;
+ u32 value = 0;
+};
+
+inline bool operator==(FPSR lhs, FPSR rhs) {
+ return lhs.Value() == rhs.Value();
+}
+
+inline bool operator!=(FPSR lhs, FPSR rhs) {
+ return !operator==(lhs, rhs);
+}
+
+} // namespace Dynarmic::FP
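
FPSR mirrors FPCR: each accessor reads or writes one condition or cumulative status bit. A trivial sketch (not part of the patch), taking the guest FPSR value as an opaque input:

#include "dynarmic/common/fp/fpsr.h"

#include <mcl/stdint.hpp>

bool SaturationOrInexactOccurred(u32 raw_fpsr) {
    const Dynarmic::FP::FPSR fpsr{raw_fpsr};
    return fpsr.QC() || fpsr.IXC();  // cumulative saturation or cumulative inexact bit set
}
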
diff --git a/externals/dynarmic/src/dynarmic/common/fp/fused.cpp b/externals/dynarmic/src/dynarmic/common/fp/fused.cpp
new file mode 100644
index 0000000000..5c32b05eb4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/fused.cpp
@@ -0,0 +1,91 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/fused.h"
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/common/fp/unpacked.h"
+#include "dynarmic/common/u128.h"
+
+namespace Dynarmic::FP {
+
+constexpr size_t product_point_position = normalized_point_position * 2;
+
+static FPUnpacked ReduceMantissa(bool sign, int exponent, const u128& mantissa) {
+ constexpr int point_position_correction = normalized_point_position - (product_point_position - 64);
+ // We round-to-odd here when reducing the bitwidth of the mantissa so that subsequent roundings are accurate.
+ return {sign, exponent + point_position_correction, mantissa.upper | static_cast<u64>(mantissa.lower != 0)};
+}
+
+FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) {
+ const bool product_sign = op1.sign != op2.sign;
+ const auto [product_exponent, product_value] = [op1, op2] {
+ int exponent = op1.exponent + op2.exponent;
+ u128 value = Multiply64To128(op1.mantissa, op2.mantissa);
+ if (value.Bit<product_point_position + 1>()) {
+ value = value >> 1;
+ exponent++;
+ }
+ return std::make_tuple(exponent, value);
+ }();
+
+ if (product_value == 0) {
+ return addend;
+ }
+
+ if (addend.mantissa == 0) {
+ return ReduceMantissa(product_sign, product_exponent, product_value);
+ }
+
+ const int exp_diff = product_exponent - addend.exponent;
+
+ if (product_sign == addend.sign) {
+ // Addition
+
+ if (exp_diff <= 0) {
+ // addend > product
+ const u64 result = addend.mantissa + StickyLogicalShiftRight(product_value, normalized_point_position - exp_diff).lower;
+ return FPUnpacked{addend.sign, addend.exponent, result};
+ }
+
+ // addend < product
+ const u128 result = product_value + StickyLogicalShiftRight(addend.mantissa, exp_diff - normalized_point_position);
+ return ReduceMantissa(product_sign, product_exponent, result);
+ }
+
+ // Subtraction
+
+ const u128 addend_long = u128(addend.mantissa) << normalized_point_position;
+
+ bool result_sign;
+ u128 result;
+ int result_exponent;
+
+ if (exp_diff == 0 && product_value > addend_long) {
+ result_sign = product_sign;
+ result_exponent = product_exponent;
+ result = product_value - addend_long;
+ } else if (exp_diff <= 0) {
+ result_sign = !product_sign;
+ result_exponent = addend.exponent;
+ result = addend_long - StickyLogicalShiftRight(product_value, -exp_diff);
+ } else {
+ result_sign = product_sign;
+ result_exponent = product_exponent;
+ result = product_value - StickyLogicalShiftRight(addend_long, exp_diff);
+ }
+
+ if (result.upper == 0) {
+ return FPUnpacked{result_sign, result_exponent, result.lower};
+ }
+
+ const int required_shift = normalized_point_position - mcl::bit::highest_set_bit(result.upper);
+ result = result << required_shift;
+ result_exponent -= required_shift;
+ return ReduceMantissa(result_sign, result_exponent, result);
+}
+
+} // namespace Dynarmic::FP
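
The key trick in ReduceMantissa is the sticky OR of the discarded low 64 bits into bit 0 (round to odd), which keeps a later round-to-nearest from seeing a false exact-halfway pattern. A stand-alone scalar illustration of the same idea, using only standard C++ (not part of the patch):

#include <cstdint>

// Assumes 0 < shift < 64. Bits shifted out are collapsed into the least
// significant bit, so later rounding still sees that the discarded part was non-zero.
std::uint64_t StickyShiftRight64(std::uint64_t value, int shift) {
    const std::uint64_t kept = value >> shift;
    const std::uint64_t lost = value & ((std::uint64_t{1} << shift) - 1);
    return kept | static_cast<std::uint64_t>(lost != 0);
}
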
diff --git a/externals/dynarmic/src/dynarmic/common/fp/fused.h b/externals/dynarmic/src/dynarmic/common/fp/fused.h
new file mode 100644
index 0000000000..0ffa2683c1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/fused.h
@@ -0,0 +1,15 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+struct FPUnpacked;
+
+/// This function assumes all arguments have been normalized.
+FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/info.h b/externals/dynarmic/src/dynarmic/common/fp/info.h
new file mode 100644
index 0000000000..8cc2d29de4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/info.h
@@ -0,0 +1,138 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+struct FPInfo {};
+
+template<>
+struct FPInfo<u16> {
+ static constexpr size_t total_width = 16;
+ static constexpr size_t exponent_width = 5;
+ static constexpr size_t explicit_mantissa_width = 10;
+ static constexpr size_t mantissa_width = explicit_mantissa_width + 1;
+
+ static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width;
+ static constexpr u32 sign_mask = 0x8000;
+ static constexpr u32 exponent_mask = 0x7C00;
+ static constexpr u32 mantissa_mask = 0x3FF;
+ static constexpr u32 mantissa_msb = 0x200;
+
+ static constexpr int exponent_min = -14;
+ static constexpr int exponent_max = 15;
+ static constexpr int exponent_bias = 15;
+
+ static constexpr u16 Zero(bool sign) {
+ return sign ? static_cast<u16>(sign_mask) : u16{0};
+ }
+
+ static constexpr u16 Infinity(bool sign) {
+ return static_cast<u16>(exponent_mask | Zero(sign));
+ }
+
+ static constexpr u16 MaxNormal(bool sign) {
+ return static_cast<u16>((exponent_mask - 1) | Zero(sign));
+ }
+
+ static constexpr u16 DefaultNaN() {
+ return static_cast<u16>(exponent_mask | (u32(1) << (explicit_mantissa_width - 1)));
+ }
+};
+
+template<>
+struct FPInfo<u32> {
+ static constexpr size_t total_width = 32;
+ static constexpr size_t exponent_width = 8;
+ static constexpr size_t explicit_mantissa_width = 23;
+ static constexpr size_t mantissa_width = explicit_mantissa_width + 1;
+
+ static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width;
+ static constexpr u32 sign_mask = 0x80000000;
+ static constexpr u32 exponent_mask = 0x7F800000;
+ static constexpr u32 mantissa_mask = 0x007FFFFF;
+ static constexpr u32 mantissa_msb = 0x00400000;
+
+ static constexpr int exponent_min = -126;
+ static constexpr int exponent_max = 127;
+ static constexpr int exponent_bias = 127;
+
+ static constexpr u32 Zero(bool sign) {
+ return sign ? sign_mask : 0;
+ }
+
+ static constexpr u32 Infinity(bool sign) {
+ return exponent_mask | Zero(sign);
+ }
+
+ static constexpr u32 MaxNormal(bool sign) {
+ return (exponent_mask - 1) | Zero(sign);
+ }
+
+ static constexpr u32 DefaultNaN() {
+ return exponent_mask | (u32(1) << (explicit_mantissa_width - 1));
+ }
+};
+
+template<>
+struct FPInfo<u64> {
+ static constexpr size_t total_width = 64;
+ static constexpr size_t exponent_width = 11;
+ static constexpr size_t explicit_mantissa_width = 52;
+ static constexpr size_t mantissa_width = explicit_mantissa_width + 1;
+
+ static constexpr u64 implicit_leading_bit = u64(1) << explicit_mantissa_width;
+ static constexpr u64 sign_mask = 0x8000'0000'0000'0000;
+ static constexpr u64 exponent_mask = 0x7FF0'0000'0000'0000;
+ static constexpr u64 mantissa_mask = 0x000F'FFFF'FFFF'FFFF;
+ static constexpr u64 mantissa_msb = 0x0008'0000'0000'0000;
+
+ static constexpr int exponent_min = -1022;
+ static constexpr int exponent_max = 1023;
+ static constexpr int exponent_bias = 1023;
+
+ static constexpr u64 Zero(bool sign) {
+ return sign ? sign_mask : 0;
+ }
+
+ static constexpr u64 Infinity(bool sign) {
+ return exponent_mask | Zero(sign);
+ }
+
+ static constexpr u64 MaxNormal(bool sign) {
+ return (exponent_mask - 1) | Zero(sign);
+ }
+
+ static constexpr u64 DefaultNaN() {
+ return exponent_mask | (u64(1) << (explicit_mantissa_width - 1));
+ }
+};
+
+/// Constructs the FPT bit pattern representing (sign ? -1 : +1) * value * 2^exponent.
+/// @note Denormals are not handled; constructing one will fail a static_assert.
+template<typename FPT, bool sign, int exponent, FPT value>
+constexpr FPT FPValue() {
+ if constexpr (value == 0) {
+ return FPInfo<FPT>::Zero(sign);
+ }
+
+ constexpr int point_position = static_cast<int>(FPInfo<FPT>::explicit_mantissa_width);
+ constexpr int highest_bit = mcl::bit::highest_set_bit(value);
+ constexpr int offset = point_position - highest_bit;
+ constexpr int normalized_exponent = exponent - offset + point_position;
+ static_assert(offset >= 0);
+ static_assert(normalized_exponent >= FPInfo<FPT>::exponent_min && normalized_exponent <= FPInfo<FPT>::exponent_max);
+
+ constexpr FPT mantissa = (value << offset) & FPInfo<FPT>::mantissa_mask;
+ constexpr FPT biased_exponent = static_cast<FPT>(normalized_exponent + FPInfo<FPT>::exponent_bias);
+ return FPT(FPInfo<FPT>::Zero(sign) | mantissa | (biased_exponent << FPInfo<FPT>::explicit_mantissa_width));
+}
+
+} // namespace Dynarmic::FP
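
FPValue builds IEEE-754 encodings at compile time. The checks below (not part of the patch) confirm the single-precision constants used later by the reciprocal-step helpers:

#include "dynarmic/common/fp/info.h"

#include <mcl/stdint.hpp>

using Dynarmic::FP::FPValue;

static_assert(FPValue<u32, false, 0, 1>() == 0x3F800000);   // 1 * 2^0  == 1.0f
static_assert(FPValue<u32, false, -1, 3>() == 0x3FC00000);  // 3 * 2^-1 == 1.5f
static_assert(FPValue<u32, false, 0, 2>() == 0x40000000);   // 2 * 2^0  == 2.0f
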
diff --git a/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h b/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h
new file mode 100644
index 0000000000..31be52f761
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h
@@ -0,0 +1,47 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::FP {
+
+enum class ResidualError {
+ Zero,
+ LessThanHalf,
+ Half,
+ GreaterThanHalf,
+};
+
+inline ResidualError ResidualErrorOnRightShift(u64 mantissa, int shift_amount) {
+ if (shift_amount <= 0 || mantissa == 0) {
+ return ResidualError::Zero;
+ }
+
+ if (shift_amount > static_cast<int>(mcl::bitsizeof<u64>)) {
+ return mcl::bit::most_significant_bit(mantissa) ? ResidualError::GreaterThanHalf : ResidualError::LessThanHalf;
+ }
+
+ const size_t half_bit_position = static_cast<size_t>(shift_amount - 1);
+ const u64 half = static_cast<u64>(1) << half_bit_position;
+ const u64 error_mask = mcl::bit::ones<u64>(static_cast<size_t>(shift_amount));
+ const u64 error = mantissa & error_mask;
+
+ if (error == 0) {
+ return ResidualError::Zero;
+ }
+ if (error < half) {
+ return ResidualError::LessThanHalf;
+ }
+ if (error == half) {
+ return ResidualError::Half;
+ }
+ return ResidualError::GreaterThanHalf;
+}
+
+} // namespace Dynarmic::FP
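
ResidualErrorOnRightShift classifies the bits a right shift would discard relative to the halfway point. A worked example for a 2-bit shift (not part of the patch; ASSERT is the mcl macro already used throughout this patch):

#include "dynarmic/common/fp/mantissa_util.h"

#include <mcl/assert.hpp>

void ResidualErrorExample() {
    using Dynarmic::FP::ResidualError;
    using Dynarmic::FP::ResidualErrorOnRightShift;
    ASSERT(ResidualErrorOnRightShift(0b0100, 2) == ResidualError::Zero);             // discards 00
    ASSERT(ResidualErrorOnRightShift(0b0101, 2) == ResidualError::LessThanHalf);     // discards 01
    ASSERT(ResidualErrorOnRightShift(0b0110, 2) == ResidualError::Half);             // discards 10
    ASSERT(ResidualErrorOnRightShift(0b1011, 2) == ResidualError::GreaterThanHalf);  // discards 11
}
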
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op.h b/externals/dynarmic/src/dynarmic/common/fp/op.h
new file mode 100644
index 0000000000..b30ff39c44
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op.h
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/common/fp/op/FPCompare.h"
+#include "dynarmic/common/fp/op/FPConvert.h"
+#include "dynarmic/common/fp/op/FPMulAdd.h"
+#include "dynarmic/common/fp/op/FPRSqrtEstimate.h"
+#include "dynarmic/common/fp/op/FPRSqrtStepFused.h"
+#include "dynarmic/common/fp/op/FPRecipEstimate.h"
+#include "dynarmic/common/fp/op/FPRecipExponent.h"
+#include "dynarmic/common/fp/op/FPRecipStepFused.h"
+#include "dynarmic/common/fp/op/FPRoundInt.h"
+#include "dynarmic/common/fp/op/FPToFixed.h"
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp
new file mode 100644
index 0000000000..8aeaadc573
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp
@@ -0,0 +1,40 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPCompare.h"
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+bool FPCompareEQ(FPT lhs, FPT rhs, FPCR fpcr, FPSR& fpsr) {
+ const auto unpacked1 = FPUnpack(lhs, fpcr, fpsr);
+ const auto unpacked2 = FPUnpack(rhs, fpcr, fpsr);
+ const auto type1 = std::get<FPType>(unpacked1);
+ const auto type2 = std::get<FPType>(unpacked2);
+ const auto& value1 = std::get<FPUnpacked>(unpacked1);
+ const auto& value2 = std::get<FPUnpacked>(unpacked2);
+
+ if (type1 == FPType::QNaN || type1 == FPType::SNaN || type2 == FPType::QNaN || type2 == FPType::SNaN) {
+ if (type1 == FPType::SNaN || type2 == FPType::SNaN) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ }
+
+ // Any comparison involving a NaN compares unequal.
+ return false;
+ }
+
+ return value1 == value2 || (type1 == FPType::Zero && type2 == FPType::Zero);
+}
+
+template bool FPCompareEQ<u16>(u16 lhs, u16 rhs, FPCR fpcr, FPSR& fpsr);
+template bool FPCompareEQ<u32>(u32 lhs, u32 rhs, FPCR fpcr, FPSR& fpsr);
+template bool FPCompareEQ<u64>(u64 lhs, u64 rhs, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h
new file mode 100644
index 0000000000..e0d5ca0374
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+bool FPCompareEQ(FPT lhs, FPT rhs, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp
new file mode 100644
index 0000000000..85f1eaf899
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp
@@ -0,0 +1,93 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPConvert.h"
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+namespace {
+template<typename FPT_TO, typename FPT_FROM>
+FPT_TO FPConvertNaN(FPT_FROM op) {
+ const bool sign = mcl::bit::get_bit<mcl::bitsizeof<FPT_FROM> - 1>(op);
+ const u64 frac = [op] {
+ if constexpr (sizeof(FPT_FROM) == sizeof(u64)) {
+ return mcl::bit::get_bits<0, 50>(op);
+ } else if constexpr (sizeof(FPT_FROM) == sizeof(u32)) {
+ return u64{mcl::bit::get_bits<0, 21>(op)} << 29;
+ } else {
+ return u64{mcl::bit::get_bits<0, 8>(op)} << 42;
+ }
+ }();
+
+ const size_t dest_bit_size = mcl::bitsizeof<FPT_TO>;
+ const u64 shifted_sign = u64{sign} << (dest_bit_size - 1);
+ const u64 exponent = mcl::bit::ones<u64>(dest_bit_size - FPInfo<FPT_TO>::explicit_mantissa_width);
+
+ if constexpr (sizeof(FPT_TO) == sizeof(u64)) {
+ return FPT_TO(shifted_sign | exponent << 51 | frac);
+ } else if constexpr (sizeof(FPT_TO) == sizeof(u32)) {
+ return FPT_TO(shifted_sign | exponent << 22 | mcl::bit::get_bits<29, 50>(frac));
+ } else {
+ return FPT_TO(shifted_sign | exponent << 9 | mcl::bit::get_bits<42, 50>(frac));
+ }
+}
+} // Anonymous namespace
+
+template<typename FPT_TO, typename FPT_FROM>
+FPT_TO FPConvert(FPT_FROM op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr) {
+ const auto [type, sign, value] = FPUnpackCV<FPT_FROM>(op, fpcr, fpsr);
+ const bool is_althp = mcl::bitsizeof<FPT_TO> == 16 && fpcr.AHP();
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ std::uintmax_t result{};
+
+ if (is_althp) {
+ result = FPInfo<FPT_TO>::Zero(sign);
+ } else if (fpcr.DN()) {
+ result = FPInfo<FPT_TO>::DefaultNaN();
+ } else {
+ result = FPConvertNaN<FPT_TO>(op);
+ }
+
+ if (type == FPType::SNaN || is_althp) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ }
+
+ return FPT_TO(result);
+ }
+
+ if (type == FPType::Infinity) {
+ if (is_althp) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return FPT_TO(u32{sign} << 15 | 0b111111111111111);
+ }
+
+ return FPInfo<FPT_TO>::Infinity(sign);
+ }
+
+ if (type == FPType::Zero) {
+ return FPInfo<FPT_TO>::Zero(sign);
+ }
+
+ return FPRoundCV<FPT_TO>(value, fpcr, rounding_mode, fpsr);
+}
+
+template u16 FPConvert<u16, u32>(u32 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+template u16 FPConvert<u16, u64>(u64 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+template u32 FPConvert<u32, u16>(u16 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+template u32 FPConvert<u32, u64>(u64 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+template u64 FPConvert<u64, u16>(u16 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+template u64 FPConvert<u64, u32>(u32 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
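
Only the six instantiations above are provided, so callers convert between the three storage widths via raw bit patterns. A sketch of a half-to-single widening with default control and status registers (not part of the patch):

#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/fpsr.h"
#include "dynarmic/common/fp/op/FPConvert.h"

#include <mcl/stdint.hpp>

u32 WidenHalfToSingle(u16 half_bits) {
    Dynarmic::FP::FPCR fpcr;
    Dynarmic::FP::FPSR fpsr;
    // e.g. half_bits == 0x3C00 (1.0 in binary16) should yield 0x3F800000 (1.0f).
    return Dynarmic::FP::FPConvert<u32, u16>(half_bits, fpcr, Dynarmic::FP::RoundingMode::ToNearest_TieEven, fpsr);
}
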
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h
new file mode 100644
index 0000000000..0fea22e482
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+enum class RoundingMode;
+
+template<typename FPT_TO, typename FPT_FROM>
+FPT_TO FPConvert(FPT_FROM op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp
new file mode 100644
index 0000000000..f97f88de17
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp
@@ -0,0 +1,90 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPMulAdd.h"
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/fused.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+ const RoundingMode rounding = fpcr.RMode();
+
+ const auto [typeA, signA, valueA] = FPUnpack(addend, fpcr, fpsr);
+ const auto [type1, sign1, value1] = FPUnpack(op1, fpcr, fpsr);
+ const auto [type2, sign2, value2] = FPUnpack(op2, fpcr, fpsr);
+
+ const bool infA = typeA == FPType::Infinity;
+ const bool inf1 = type1 == FPType::Infinity;
+ const bool inf2 = type2 == FPType::Infinity;
+ const bool zeroA = typeA == FPType::Zero;
+ const bool zero1 = type1 == FPType::Zero;
+ const bool zero2 = type2 == FPType::Zero;
+
+ const auto maybe_nan = FPProcessNaNs3<FPT>(typeA, type1, type2, addend, op1, op2, fpcr, fpsr);
+
+ if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return FPInfo<FPT>::DefaultNaN();
+ }
+
+ if (maybe_nan) {
+ return *maybe_nan;
+ }
+
+ // Calculate properties of product (op1 * op2).
+ const bool signP = sign1 != sign2;
+ const bool infP = inf1 || inf2;
+ const bool zeroP = zero1 || zero2;
+
+ // Raise InvalidOp on (inf * zero) or on adding infinities of opposite sign.
+ if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return FPInfo<FPT>::DefaultNaN();
+ }
+
+ // Handle infinities
+ if ((infA && !signA) || (infP && !signP)) {
+ return FPInfo<FPT>::Infinity(false);
+ }
+ if ((infA && signA) || (infP && signP)) {
+ return FPInfo<FPT>::Infinity(true);
+ }
+
+ // Result is exactly zero
+ if (zeroA && zeroP && signA == signP) {
+ return FPInfo<FPT>::Zero(signA);
+ }
+
+ const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2);
+ if (result_value.mantissa == 0) {
+ return FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity);
+ }
+ return FPRound<FPT>(result_value, fpcr, fpsr);
+}
+
+template u16 FPMulAdd<u16>(u16 addend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+ return FPMulAdd<FPT>(minuend, (op1 ^ FPInfo<FPT>::sign_mask), op2, fpcr, fpsr);
+}
+
+template u16 FPMulSub<u16>(u16 minuend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPMulSub<u32>(u32 minuend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPMulSub<u64>(u64 minuend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
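
FPMulAdd computes addend + op1 * op2 on raw bit patterns, and FPMulSub simply flips op1's sign bit before delegating. A small sketch with invented operands (not part of the patch):

#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/fpsr.h"
#include "dynarmic/common/fp/op/FPMulAdd.h"

#include <mcl/stdint.hpp>

u32 FmaExample() {
    Dynarmic::FP::FPCR fpcr;
    Dynarmic::FP::FPSR fpsr;
    // 1.0f + 1.5f * 2.0f; the exact result 4.0f should round to 0x40800000.
    return Dynarmic::FP::FPMulAdd<u32>(0x3F800000, 0x3FC00000, 0x40000000, fpcr, fpsr);
}
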
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h
new file mode 100644
index 0000000000..8e0a16cda5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h
@@ -0,0 +1,19 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h
new file mode 100644
index 0000000000..4f172a598e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include "dynarmic/common/fp/info.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+constexpr FPT FPNeg(FPT op) {
+ return op ^ FPInfo<FPT>::sign_mask;
+}
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp
new file mode 100644
index 0000000000..ed28bcc5cc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp
@@ -0,0 +1,59 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRSqrtEstimate.h"
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+#include "dynarmic/common/math_util.h"
+#include "dynarmic/common/safe_ops.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
+ const auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr);
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ return FPProcessNaN(type, op, fpcr, fpsr);
+ }
+
+ if (type == FPType::Zero) {
+ FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
+ return FPInfo<FPT>::Infinity(sign);
+ }
+
+ if (sign) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return FPInfo<FPT>::DefaultNaN();
+ }
+
+ if (type == FPType::Infinity) {
+ // Note: Only +Inf reaches here; -Inf was already handled by the sign check above.
+ return FPInfo<FPT>::Zero(false);
+ }
+
+ const int result_exponent = (-(value.exponent + 1)) >> 1;
+ const bool was_exponent_odd = (value.exponent) % 2 == 0;
+
+ const u64 scaled = Safe::LogicalShiftRight(value.mantissa, normalized_point_position - (was_exponent_odd ? 7 : 8));
+ const u64 estimate = Common::RecipSqrtEstimate(scaled);
+
+ const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias);
+ const FPT bits_mantissa = static_cast<FPT>(estimate << (FPInfo<FPT>::explicit_mantissa_width - 8));
+ return (bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask);
+}
+
+template u16 FPRSqrtEstimate<u16>(u16 op, FPCR fpcr, FPSR& fpsr);
+template u32 FPRSqrtEstimate<u32>(u32 op, FPCR fpcr, FPSR& fpsr);
+template u64 FPRSqrtEstimate<u64>(u64 op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h
new file mode 100644
index 0000000000..f51be1e8bc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp
new file mode 100644
index 0000000000..06fe96d44b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp
@@ -0,0 +1,57 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRSqrtStepFused.h"
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/fused.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/op/FPNeg.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+ op1 = FPNeg(op1);
+
+ const auto [type1, sign1, value1] = FPUnpack(op1, fpcr, fpsr);
+ const auto [type2, sign2, value2] = FPUnpack(op2, fpcr, fpsr);
+
+ if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) {
+ return *maybe_nan;
+ }
+
+ const bool inf1 = type1 == FPType::Infinity;
+ const bool inf2 = type2 == FPType::Infinity;
+ const bool zero1 = type1 == FPType::Zero;
+ const bool zero2 = type2 == FPType::Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2)) {
+ // return +1.5
+ return FPValue<FPT, false, -1, 3>();
+ }
+
+ if (inf1 || inf2) {
+ return FPInfo<FPT>::Infinity(sign1 != sign2);
+ }
+
+ // result_value = (3.0 + (value1 * value2)) / 2.0
+ FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 3), value1, value2);
+ result_value.exponent--;
+
+ if (result_value.mantissa == 0) {
+ return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
+ }
+ return FPRound<FPT>(result_value, fpcr, fpsr);
+}
+
+template u16 FPRSqrtStepFused<u16>(u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPRSqrtStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPRSqrtStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h
new file mode 100644
index 0000000000..384a75924d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp
new file mode 100644
index 0000000000..1fd4b924b2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp
@@ -0,0 +1,100 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRecipEstimate.h"
+
+#include <tuple>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+#include "dynarmic/common/math_util.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr) {
+ FPType type;
+ bool sign;
+ FPUnpacked value;
+ std::tie(type, sign, value) = FPUnpack<FPT>(op, fpcr, fpsr);
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ return FPProcessNaN(type, op, fpcr, fpsr);
+ }
+
+ if (type == FPType::Infinity) {
+ return FPInfo<FPT>::Zero(sign);
+ }
+
+ if (type == FPType::Zero) {
+ FPProcessException(FPExc::DivideByZero, fpcr, fpsr);
+ return FPInfo<FPT>::Infinity(sign);
+ }
+
+ if (value.exponent < FPInfo<FPT>::exponent_min - 2) {
+ const bool overflow_to_inf = [&] {
+ switch (fpcr.RMode()) {
+ case RoundingMode::ToNearest_TieEven:
+ return true;
+ case RoundingMode::TowardsPlusInfinity:
+ return !sign;
+ case RoundingMode::TowardsMinusInfinity:
+ return sign;
+ case RoundingMode::TowardsZero:
+ return false;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ FPProcessException(FPExc::Overflow, fpcr, fpsr);
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ return overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign);
+ }
+
+ if ((fpcr.FZ() && !std::is_same_v<FPT, u16>) || (fpcr.FZ16() && std::is_same_v<FPT, u16>)) {
+ if (value.exponent >= -FPInfo<FPT>::exponent_min) {
+ fpsr.UFC(true);
+ return FPInfo<FPT>::Zero(sign);
+ }
+ }
+
+ const u64 scaled = value.mantissa >> (normalized_point_position - 8);
+ u64 estimate = static_cast<u64>(Common::RecipEstimate(scaled)) << (FPInfo<FPT>::explicit_mantissa_width - 8);
+ int result_exponent = -(value.exponent + 1);
+ if (result_exponent < FPInfo<FPT>::exponent_min) {
+ switch (result_exponent) {
+ case (FPInfo<FPT>::exponent_min - 1):
+ estimate |= FPInfo<FPT>::implicit_leading_bit;
+ estimate >>= 1;
+ break;
+ case (FPInfo<FPT>::exponent_min - 2):
+ estimate |= FPInfo<FPT>::implicit_leading_bit;
+ estimate >>= 2;
+ result_exponent++;
+ break;
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ const FPT bits_sign = FPInfo<FPT>::Zero(sign);
+ const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias);
+ const FPT bits_mantissa = static_cast<FPT>(estimate);
+ return FPT((bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask) | bits_sign);
+}
+
+template u16 FPRecipEstimate<u16>(u16 op, FPCR fpcr, FPSR& fpsr);
+template u32 FPRecipEstimate<u32>(u32 op, FPCR fpcr, FPSR& fpsr);
+template u64 FPRecipEstimate<u64>(u64 op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h
new file mode 100644
index 0000000000..eaf6fee5bf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp
new file mode 100644
index 0000000000..171b94e96e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp
@@ -0,0 +1,59 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRecipExponent.h"
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+namespace {
+template<typename FPT>
+FPT DetermineExponentValue(size_t value) {
+ if constexpr (sizeof(FPT) == sizeof(u32)) {
+ return static_cast<FPT>(mcl::bit::get_bits<23, 30>(value));
+ } else if constexpr (sizeof(FPT) == sizeof(u64)) {
+ return static_cast<FPT>(mcl::bit::get_bits<52, 62>(value));
+ } else {
+ return static_cast<FPT>(mcl::bit::get_bits<10, 14>(value));
+ }
+}
+} // Anonymous namespace
+
+template<typename FPT>
+FPT FPRecipExponent(FPT op, FPCR fpcr, FPSR& fpsr) {
+ const auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr);
+ (void)value;
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ return FPProcessNaN(type, op, fpcr, fpsr);
+ }
+
+ const FPT sign_bits = FPInfo<FPT>::Zero(sign);
+ const FPT exponent = DetermineExponentValue<FPT>(op);
+
+ // Zero and denormals
+ if (exponent == 0) {
+ const FPT max_exponent = mcl::bit::ones<FPT>(FPInfo<FPT>::exponent_width) - 1;
+ return FPT(sign_bits | (max_exponent << FPInfo<FPT>::explicit_mantissa_width));
+ }
+
+ // Infinities and normals
+ const FPT negated_exponent = FPT(~exponent);
+ const FPT adjusted_exponent = FPT(negated_exponent << FPInfo<FPT>::explicit_mantissa_width) & FPInfo<FPT>::exponent_mask;
+ return FPT(sign_bits | adjusted_exponent);
+}
+
+template u16 FPRecipExponent<u16>(u16 op, FPCR fpcr, FPSR& fpsr);
+template u32 FPRecipExponent<u32>(u32 op, FPCR fpcr, FPSR& fpsr);
+template u64 FPRecipExponent<u64>(u64 op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h
new file mode 100644
index 0000000000..63ff3530b2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRecipExponent(FPT op, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp
new file mode 100644
index 0000000000..b01477feca
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp
@@ -0,0 +1,56 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRecipStepFused.h"
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/fused.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/op/FPNeg.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+ op1 = FPNeg(op1);
+
+ const auto [type1, sign1, value1] = FPUnpack<FPT>(op1, fpcr, fpsr);
+ const auto [type2, sign2, value2] = FPUnpack<FPT>(op2, fpcr, fpsr);
+
+ if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) {
+ return *maybe_nan;
+ }
+
+ const bool inf1 = type1 == FPType::Infinity;
+ const bool inf2 = type2 == FPType::Infinity;
+ const bool zero1 = type1 == FPType::Zero;
+ const bool zero2 = type2 == FPType::Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2)) {
+ // return +2.0
+ return FPValue<FPT, false, 0, 2>();
+ }
+
+ if (inf1 || inf2) {
+ return FPInfo<FPT>::Infinity(sign1 != sign2);
+ }
+
+ // result_value = 2.0 + (value1 * value2)
+ const FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2);
+
+ if (result_value.mantissa == 0) {
+ return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity);
+ }
+ return FPRound<FPT>(result_value, fpcr, fpsr);
+}
+
+template u16 FPRecipStepFused<u16>(u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template u32 FPRecipStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template u64 FPRecipStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h
new file mode 100644
index 0000000000..abe447b500
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+template<typename FPT>
+FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp
new file mode 100644
index 0000000000..7325909075
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp
@@ -0,0 +1,97 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPRoundInt.h"
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/mantissa_util.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/process_nan.h"
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/common/fp/unpacked.h"
+#include "dynarmic/common/safe_ops.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr) {
+ ASSERT(rounding != RoundingMode::ToOdd);
+
+ auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr);
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ return FPProcessNaN(type, op, fpcr, fpsr);
+ }
+
+ if (type == FPType::Infinity) {
+ return FPInfo<FPT>::Infinity(sign);
+ }
+
+ if (type == FPType::Zero) {
+ return FPInfo<FPT>::Zero(sign);
+ }
+
+ // Shift the binary point back down to bit zero.
+ const int exponent = value.exponent - normalized_point_position;
+
+ if (exponent >= 0) {
+ // Guaranteed to be an integer
+ return op;
+ }
+
+ u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa);
+ const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent);
+ int_result = Safe::ArithmeticShiftLeft(int_result, exponent);
+
+ bool round_up = false;
+ switch (rounding) {
+ case RoundingMode::ToNearest_TieEven:
+ round_up = error > ResidualError::Half || (error == ResidualError::Half && mcl::bit::get_bit<0>(int_result));
+ break;
+ case RoundingMode::TowardsPlusInfinity:
+ round_up = error != ResidualError::Zero;
+ break;
+ case RoundingMode::TowardsMinusInfinity:
+ round_up = false;
+ break;
+ case RoundingMode::TowardsZero:
+ round_up = error != ResidualError::Zero && mcl::bit::most_significant_bit(int_result);
+ break;
+ case RoundingMode::ToNearest_TieAwayFromZero:
+ round_up = error > ResidualError::Half || (error == ResidualError::Half && !mcl::bit::most_significant_bit(int_result));
+ break;
+ case RoundingMode::ToOdd:
+ UNREACHABLE();
+ }
+
+ if (round_up) {
+ int_result++;
+ }
+
+ const bool new_sign = mcl::bit::most_significant_bit(int_result);
+ const u64 abs_int_result = new_sign ? Safe::Negate<u64>(int_result) : static_cast<u64>(int_result);
+
+ const FPT result = int_result == 0
+ ? FPInfo<FPT>::Zero(sign)
+ : FPRound<FPT>(FPUnpacked{new_sign, normalized_point_position, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr);
+
+ if (error != ResidualError::Zero && exact) {
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ }
+
+ return result;
+}
+
+template u64 FPRoundInt<u16>(u16 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr);
+template u64 FPRoundInt<u32>(u32 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr);
+template u64 FPRoundInt<u64>(u64 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h
new file mode 100644
index 0000000000..e326627ce1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h
@@ -0,0 +1,19 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+enum class RoundingMode;
+
+template<typename FPT>
+u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp
new file mode 100644
index 0000000000..b1a57d3104
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp
@@ -0,0 +1,105 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/op/FPToFixed.h"
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/mantissa_util.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/common/fp/unpacked.h"
+#include "dynarmic/common/safe_ops.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
+ ASSERT(rounding != RoundingMode::ToOdd);
+ ASSERT(ibits <= 64);
+ ASSERT(fbits <= ibits);
+
+ auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr);
+
+ if (type == FPType::SNaN || type == FPType::QNaN) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ }
+
+ // Handle zero
+ if (value.mantissa == 0) {
+ return 0;
+ }
+
+ if (sign && unsigned_) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return 0;
+ }
+
+ // Scale by 2^fbits and shift the binary point back down to bit zero.
+ int exponent = value.exponent + static_cast<int>(fbits) - normalized_point_position;
+
+ u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa);
+ const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent);
+ int_result = Safe::ArithmeticShiftLeft(int_result, exponent);
+
+ bool round_up = false;
+ switch (rounding) {
+ case RoundingMode::ToNearest_TieEven:
+ round_up = error > ResidualError::Half || (error == ResidualError::Half && mcl::bit::get_bit<0>(int_result));
+ break;
+ case RoundingMode::TowardsPlusInfinity:
+ round_up = error != ResidualError::Zero;
+ break;
+ case RoundingMode::TowardsMinusInfinity:
+ round_up = false;
+ break;
+ case RoundingMode::TowardsZero:
+ round_up = error != ResidualError::Zero && mcl::bit::most_significant_bit(int_result);
+ break;
+ case RoundingMode::ToNearest_TieAwayFromZero:
+ round_up = error > ResidualError::Half || (error == ResidualError::Half && !mcl::bit::most_significant_bit(int_result));
+ break;
+ case RoundingMode::ToOdd:
+ UNREACHABLE();
+ }
+
+ if (round_up) {
+ int_result++;
+ }
+
+ // Detect Overflow
+ const int min_exponent_for_overflow = static_cast<int>(ibits) - static_cast<int>(mcl::bit::highest_set_bit(value.mantissa + (round_up ? Safe::LogicalShiftRight<u64>(1, exponent) : 0))) - (unsigned_ ? 0 : 1);
+ if (exponent >= min_exponent_for_overflow) {
+ // Positive overflow
+ if (unsigned_ || !sign) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return mcl::bit::ones<u64>(ibits - (unsigned_ ? 0 : 1));
+ }
+
+ // Negative overflow
+ const u64 min_value = Safe::Negate<u64>(static_cast<u64>(1) << (ibits - 1));
+ if (!(exponent == min_exponent_for_overflow && int_result == min_value)) {
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ return static_cast<u64>(1) << (ibits - 1);
+ }
+ }
+
+ if (error != ResidualError::Zero) {
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ }
+ return int_result & mcl::bit::ones<u64>(ibits);
+}
+
+template u64 FPToFixed<u16>(size_t ibits, u16 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+template u64 FPToFixed<u32>(size_t ibits, u32 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+template u64 FPToFixed<u64>(size_t ibits, u64 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
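
FPToFixed is the common back end for the float-to-integer and float-to-fixed conversions: ibits selects the destination width, fbits the number of fraction bits, and unsigned_ the signedness. A sketch of a plain signed 32-bit conversion (not part of the patch):

#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/fpsr.h"
#include "dynarmic/common/fp/op/FPToFixed.h"

#include <mcl/stdint.hpp>

u64 SingleToS32(u32 single_bits) {
    Dynarmic::FP::FPCR fpcr;
    Dynarmic::FP::FPSR fpsr;
    // 32 integer bits, 0 fraction bits, signed; e.g. 2.5f (0x40200000) rounds to 2 under ties-to-even.
    return Dynarmic::FP::FPToFixed<u32>(32, single_bits, 0, false, fpcr,
                                        Dynarmic::FP::RoundingMode::ToNearest_TieEven, fpsr);
}
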
diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h
new file mode 100644
index 0000000000..53a952836e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h
@@ -0,0 +1,19 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+enum class RoundingMode;
+
+template<typename FPT>
+u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp b/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp
new file mode 100644
index 0000000000..a97e0ec820
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp
@@ -0,0 +1,59 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/process_exception.h"
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+
+namespace Dynarmic::FP {
+
+void FPProcessException(FPExc exception, FPCR fpcr, FPSR& fpsr) {
+ switch (exception) {
+ case FPExc::InvalidOp:
+ if (fpcr.IOE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.IOC(true);
+ break;
+ case FPExc::DivideByZero:
+ if (fpcr.DZE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.DZC(true);
+ break;
+ case FPExc::Overflow:
+ if (fpcr.OFE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.OFC(true);
+ break;
+ case FPExc::Underflow:
+ if (fpcr.UFE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.UFC(true);
+ break;
+ case FPExc::Inexact:
+ if (fpcr.IXE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.IXC(true);
+ break;
+ case FPExc::InputDenorm:
+ if (fpcr.IDE()) {
+ ASSERT_FALSE("Raising floating point exceptions unimplemented");
+ }
+ fpsr.IDC(true);
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+}
+
+} // namespace Dynarmic::FP
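
With the corresponding trap-enable bit clear in FPCR (the default), FPProcessException only accumulates the matching status bit in FPSR; enabling a trap currently hits the ASSERT_FALSE above. A minimal sketch (not part of the patch):

#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/fpsr.h"
#include "dynarmic/common/fp/process_exception.h"

bool InexactSetsCumulativeBit() {
    Dynarmic::FP::FPCR fpcr;  // IXE() is false, so no trap is attempted
    Dynarmic::FP::FPSR fpsr;
    Dynarmic::FP::FPProcessException(Dynarmic::FP::FPExc::Inexact, fpcr, fpsr);
    return fpsr.IXC();        // true after the call
}
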
diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_exception.h b/externals/dynarmic/src/dynarmic/common/fp/process_exception.h
new file mode 100644
index 0000000000..436a9daa28
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/process_exception.h
@@ -0,0 +1,24 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+
+enum class FPExc {
+ InvalidOp,
+ DivideByZero,
+ Overflow,
+ Underflow,
+ Inexact,
+ InputDenorm,
+};
+
+void FPProcessException(FPExc exception, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
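
A small sketch of the intended behaviour, assuming a default-constructed FPCR (all trap-enable bits clear) and that FPSR exposes cumulative-flag getters mirroring the setters used in process_exception.cpp above:

    #include "dynarmic/common/fp/fpcr.h"
    #include "dynarmic/common/fp/fpsr.h"
    #include "dynarmic/common/fp/process_exception.h"

    #include <cassert>

    int main() {
        using namespace Dynarmic::FP;
        FPCR fpcr;  // trap-enable bits (IXE, IOE, ...) clear by default (assumption)
        FPSR fpsr;

        // With trapping disabled, the call merely accumulates the corresponding FPSR flag.
        FPProcessException(FPExc::Inexact, fpcr, fpsr);
        assert(fpsr.IXC());  // assumption: FPSR has an IXC() getter matching the IXC(bool) setter

        // Had fpcr.IXE() been set, the current implementation would instead hit
        // ASSERT_FALSE("Raising floating point exceptions unimplemented").
    }
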
diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp b/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp
new file mode 100644
index 0000000000..516b92adb6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp
@@ -0,0 +1,93 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/process_nan.h"
+
+#include <optional>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/unpacked.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+FPT FPProcessNaN(FPType type, FPT op, FPCR fpcr, FPSR& fpsr) {
+ ASSERT(type == FPType::QNaN || type == FPType::SNaN);
+
+ constexpr size_t topfrac = FPInfo<FPT>::explicit_mantissa_width - 1;
+
+ FPT result = op;
+
+ if (type == FPType::SNaN) {
+ result = mcl::bit::set_bit<topfrac>(op, true);
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ }
+
+ if (fpcr.DN()) {
+ result = FPInfo<FPT>::DefaultNaN();
+ }
+
+ return result;
+}
+
+template u16 FPProcessNaN<u16>(FPType type, u16 op, FPCR fpcr, FPSR& fpsr);
+template u32 FPProcessNaN<u32>(FPType type, u32 op, FPCR fpcr, FPSR& fpsr);
+template u64 FPProcessNaN<u64>(FPType type, u64 op, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+std::optional<FPT> FPProcessNaNs(FPType type1, FPType type2, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) {
+ if (type1 == FPType::SNaN) {
+ return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr);
+ }
+ if (type2 == FPType::SNaN) {
+ return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr);
+ }
+ if (type1 == FPType::QNaN) {
+ return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr);
+ }
+ if (type2 == FPType::QNaN) {
+ return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr);
+ }
+ return std::nullopt;
+}
+
+template std::optional<u16> FPProcessNaNs<u16>(FPType type1, FPType type2, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr);
+template std::optional<u32> FPProcessNaNs<u32>(FPType type1, FPType type2, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr);
+template std::optional<u64> FPProcessNaNs<u64>(FPType type1, FPType type2, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+std::optional<FPT> FPProcessNaNs3(FPType type1, FPType type2, FPType type3, FPT op1, FPT op2, FPT op3, FPCR fpcr, FPSR& fpsr) {
+ if (type1 == FPType::SNaN) {
+ return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr);
+ }
+ if (type2 == FPType::SNaN) {
+ return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr);
+ }
+ if (type3 == FPType::SNaN) {
+ return FPProcessNaN<FPT>(type3, op3, fpcr, fpsr);
+ }
+ if (type1 == FPType::QNaN) {
+ return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr);
+ }
+ if (type2 == FPType::QNaN) {
+ return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr);
+ }
+ if (type3 == FPType::QNaN) {
+ return FPProcessNaN<FPT>(type3, op3, fpcr, fpsr);
+ }
+ return std::nullopt;
+}
+
+template std::optional<u16> FPProcessNaNs3<u16>(FPType type1, FPType type2, FPType type3, u16 op1, u16 op2, u16 op3, FPCR fpcr, FPSR& fpsr);
+template std::optional<u32> FPProcessNaNs3<u32>(FPType type1, FPType type2, FPType type3, u32 op1, u32 op2, u32 op3, FPCR fpcr, FPSR& fpsr);
+template std::optional<u64> FPProcessNaNs3<u64>(FPType type1, FPType type2, FPType type3, u64 op1, u64 op2, u64 op3, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
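
A short sketch of the selection order implemented above (SNaNs win over QNaNs, earlier operands over later ones), assuming default FPCR/FPSR state so the operand NaN, not the default NaN, is propagated:

    #include "dynarmic/common/fp/fpcr.h"
    #include "dynarmic/common/fp/fpsr.h"
    #include "dynarmic/common/fp/process_nan.h"
    #include "dynarmic/common/fp/unpacked.h"  // for FPType

    #include <cassert>

    int main() {
        using namespace Dynarmic::FP;
        FPCR fpcr;  // assumption: DN() is clear by default
        FPSR fpsr;

        const u32 qnan = 0x7FC00001;  // f32 quiet NaN
        const u32 snan = 0x7F800001;  // f32 signalling NaN

        // op2 is an SNaN, so it takes priority over op1's QNaN; it is quietened
        // (top fraction bit set) and InvalidOp is accumulated in the FPSR.
        const auto result = FPProcessNaNs<u32>(FPType::QNaN, FPType::SNaN, qnan, snan, fpcr, fpsr);
        assert(result.has_value() && *result == 0x7FC00001);
    }
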
diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_nan.h b/externals/dynarmic/src/dynarmic/common/fp/process_nan.h
new file mode 100644
index 0000000000..331b7943fc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/process_nan.h
@@ -0,0 +1,25 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+namespace Dynarmic::FP {
+
+class FPCR;
+class FPSR;
+enum class FPType;
+
+template<typename FPT>
+FPT FPProcessNaN(FPType type, FPT op, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+std::optional<FPT> FPProcessNaNs(FPType type1, FPType type2, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+std::optional<FPT> FPProcessNaNs3(FPType type1, FPType type2, FPType type3, FPT op1, FPT op2, FPT op3, FPCR fpcr, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h b/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h
new file mode 100644
index 0000000000..fcf36f6100
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h
@@ -0,0 +1,27 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::FP {
+
+/// Ordering of first four values is important as they correspond to bits in FPCR.
+enum class RoundingMode {
+ /// Round to nearest floating point. If there is a tie, round to nearest even digit in required position.
+ ToNearest_TieEven,
+ /// Round up towards positive infinity.
+ TowardsPlusInfinity,
+ /// Round downwards towards negative infinity.
+ TowardsMinusInfinity,
+ /// Truncate towards zero.
+ TowardsZero,
+ /// Round to nearest floating point. If there is a tie, round away from zero.
+ ToNearest_TieAwayFromZero,
+ /// Von Neumann rounding (as modified by Brent). Also known as sticky rounding.
+ /// Set the least significant bit to 1 if the result is not exact.
+ ToOdd,
+};
+
+} // namespace Dynarmic::FP
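
A worked example of the modes above, rounding the exact value 2.5 to an integer:

    ToNearest_TieEven          -> 2   (tie resolved towards the even neighbour)
    TowardsPlusInfinity        -> 3
    TowardsMinusInfinity       -> 2
    TowardsZero                -> 2
    ToNearest_TieAwayFromZero  -> 3
    ToOdd                      -> 3   (truncate to 2, then force the LSB to 1 because the result is inexact)
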
diff --git a/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp b/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp
new file mode 100644
index 0000000000..f853ab07e1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp
@@ -0,0 +1,197 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/unpacked.h"
+
+#include <algorithm>
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/common/fp/fpsr.h"
+#include "dynarmic/common/fp/info.h"
+#include "dynarmic/common/fp/mantissa_util.h"
+#include "dynarmic/common/fp/process_exception.h"
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/common/safe_ops.h"
+
+namespace Dynarmic::FP {
+
+template<typename FPT>
+std::tuple<FPType, bool, FPUnpacked> FPUnpackBase(FPT op, FPCR fpcr, [[maybe_unused]] FPSR& fpsr) {
+ constexpr size_t sign_bit = FPInfo<FPT>::exponent_width + FPInfo<FPT>::explicit_mantissa_width;
+ constexpr size_t exponent_high_bit = FPInfo<FPT>::exponent_width + FPInfo<FPT>::explicit_mantissa_width - 1;
+ constexpr size_t exponent_low_bit = FPInfo<FPT>::explicit_mantissa_width;
+ constexpr size_t mantissa_high_bit = FPInfo<FPT>::explicit_mantissa_width - 1;
+ constexpr size_t mantissa_low_bit = 0;
+ constexpr int denormal_exponent = FPInfo<FPT>::exponent_min - int(FPInfo<FPT>::explicit_mantissa_width);
+
+ constexpr bool is_half_precision = std::is_same_v<FPT, u16>;
+ const bool sign = mcl::bit::get_bit<sign_bit>(op);
+ const FPT exp_raw = mcl::bit::get_bits<exponent_low_bit, exponent_high_bit>(op);
+ const FPT frac_raw = mcl::bit::get_bits<mantissa_low_bit, mantissa_high_bit>(op);
+
+ if (exp_raw == 0) {
+ if constexpr (is_half_precision) {
+ if (frac_raw == 0 || fpcr.FZ16()) {
+ return {FPType::Zero, sign, {sign, 0, 0}};
+ }
+ return {FPType::Nonzero, sign, ToNormalized(sign, denormal_exponent, frac_raw)};
+ } else {
+ if (frac_raw == 0 || fpcr.FZ()) {
+ if (frac_raw != 0) {
+ FPProcessException(FPExc::InputDenorm, fpcr, fpsr);
+ }
+ return {FPType::Zero, sign, {sign, 0, 0}};
+ }
+
+ return {FPType::Nonzero, sign, ToNormalized(sign, denormal_exponent, frac_raw)};
+ }
+ }
+
+ const bool exp_all_ones = exp_raw == mcl::bit::ones<FPT>(FPInfo<FPT>::exponent_width);
+ const bool ahp_disabled = is_half_precision && !fpcr.AHP();
+ if ((exp_all_ones && !is_half_precision) || (exp_all_ones && ahp_disabled)) {
+ if (frac_raw == 0) {
+ return {FPType::Infinity, sign, ToNormalized(sign, 1000000, 1)};
+ }
+
+ const bool is_quiet = mcl::bit::get_bit<mantissa_high_bit>(frac_raw);
+ return {is_quiet ? FPType::QNaN : FPType::SNaN, sign, {sign, 0, 0}};
+ }
+
+ const int exp = static_cast<int>(exp_raw) - FPInfo<FPT>::exponent_bias;
+ const u64 frac = static_cast<u64>(frac_raw | FPInfo<FPT>::implicit_leading_bit) << (normalized_point_position - FPInfo<FPT>::explicit_mantissa_width);
+ return {FPType::Nonzero, sign, {sign, exp, frac}};
+}
+
+template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u16>(u16 op, FPCR fpcr, FPSR& fpsr);
+template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u32>(u32 op, FPCR fpcr, FPSR& fpsr);
+template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u64>(u64 op, FPCR fpcr, FPSR& fpsr);
+
+template<size_t F>
+std::tuple<bool, int, u64, ResidualError> Normalize(FPUnpacked op, int extra_right_shift = 0) {
+ const int highest_set_bit = mcl::bit::highest_set_bit(op.mantissa);
+ const int shift_amount = highest_set_bit - static_cast<int>(F) + extra_right_shift;
+ const u64 mantissa = Safe::LogicalShiftRight(op.mantissa, shift_amount);
+ const ResidualError error = ResidualErrorOnRightShift(op.mantissa, shift_amount);
+ const int exponent = op.exponent + highest_set_bit - normalized_point_position;
+ return std::make_tuple(op.sign, exponent, mantissa, error);
+}
+
+template<typename FPT>
+FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
+ ASSERT(op.mantissa != 0);
+ ASSERT(rounding != RoundingMode::ToNearest_TieAwayFromZero);
+
+ constexpr int minimum_exp = FPInfo<FPT>::exponent_min;
+ constexpr size_t E = FPInfo<FPT>::exponent_width;
+ constexpr size_t F = FPInfo<FPT>::explicit_mantissa_width;
+ constexpr bool isFP16 = FPInfo<FPT>::total_width == 16;
+
+ auto [sign, exponent, mantissa, error] = Normalize<F>(op);
+
+ if (((!isFP16 && fpcr.FZ()) || (isFP16 && fpcr.FZ16())) && exponent < minimum_exp) {
+ fpsr.UFC(true);
+ return FPInfo<FPT>::Zero(sign);
+ }
+
+ int biased_exp = std::max<int>(exponent - minimum_exp + 1, 0);
+ if (biased_exp == 0) {
+ std::tie(sign, exponent, mantissa, error) = Normalize<F>(op, minimum_exp - exponent);
+ }
+
+ if (biased_exp == 0 && (error != ResidualError::Zero || fpcr.UFE())) {
+ FPProcessException(FPExc::Underflow, fpcr, fpsr);
+ }
+
+ bool round_up = false, overflow_to_inf = false;
+ switch (rounding) {
+ case RoundingMode::ToNearest_TieEven: {
+ round_up = (error > ResidualError::Half) || (error == ResidualError::Half && mcl::bit::get_bit<0>(mantissa));
+ overflow_to_inf = true;
+ break;
+ }
+ case RoundingMode::TowardsPlusInfinity:
+ round_up = error != ResidualError::Zero && !sign;
+ overflow_to_inf = !sign;
+ break;
+ case RoundingMode::TowardsMinusInfinity:
+ round_up = error != ResidualError::Zero && sign;
+ overflow_to_inf = sign;
+ break;
+ default:
+ break;
+ }
+
+ if (round_up) {
+ if ((mantissa & FPInfo<FPT>::mantissa_mask) == FPInfo<FPT>::mantissa_mask) {
+ // Overflow on rounding up is going to happen
+ if (mantissa == FPInfo<FPT>::mantissa_mask) {
+ // Rounding up from denormal to normal
+ mantissa++;
+ biased_exp++;
+ } else {
+ // Rounding up to next exponent
+ mantissa = (mantissa + 1) / 2;
+ biased_exp++;
+ }
+ } else {
+ mantissa++;
+ }
+ }
+
+ if (error != ResidualError::Zero && rounding == RoundingMode::ToOdd) {
+ mantissa = mcl::bit::set_bit<0>(mantissa, true);
+ }
+
+ FPT result = 0;
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4127) // C4127: conditional expression is constant
+#endif
+ if (!isFP16 || !fpcr.AHP()) {
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+ constexpr int max_biased_exp = (1 << E) - 1;
+ if (biased_exp >= max_biased_exp) {
+ result = overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign);
+ FPProcessException(FPExc::Overflow, fpcr, fpsr);
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ } else {
+ result = sign ? 1 : 0;
+ result <<= E;
+ result += FPT(biased_exp);
+ result <<= F;
+ result |= static_cast<FPT>(mantissa) & FPInfo<FPT>::mantissa_mask;
+ if (error != ResidualError::Zero) {
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ }
+ }
+ } else {
+ constexpr int max_biased_exp = (1 << E);
+ if (biased_exp >= max_biased_exp) {
+ result = sign ? 0xFFFF : 0x7FFF;
+ FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
+ } else {
+ result = sign ? 1 : 0;
+ result <<= E;
+ result += FPT(biased_exp);
+ result <<= F;
+ result |= static_cast<FPT>(mantissa) & FPInfo<FPT>::mantissa_mask;
+ if (error != ResidualError::Zero) {
+ FPProcessException(FPExc::Inexact, fpcr, fpsr);
+ }
+ }
+ }
+ return result;
+}
+
+template u16 FPRoundBase<u16>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+template u32 FPRoundBase<u32>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+template u64 FPRoundBase<u64>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+
+} // namespace Dynarmic::FP
diff --git a/externals/dynarmic/src/dynarmic/common/fp/unpacked.h b/externals/dynarmic/src/dynarmic/common/fp/unpacked.h
new file mode 100644
index 0000000000..77f33d8966
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/unpacked.h
@@ -0,0 +1,90 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <tuple>
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+
+namespace Dynarmic::FP {
+
+class FPSR;
+enum class RoundingMode;
+
+enum class FPType {
+ Nonzero,
+ Zero,
+ Infinity,
+ QNaN,
+ SNaN,
+};
+
+constexpr size_t normalized_point_position = 62;
+
+/// value = (sign ? -1 : +1) * mantissa/(2^62) * 2^exponent
+/// 63rd bit of mantissa is always set (unless value is zero)
+struct FPUnpacked {
+ bool sign;
+ int exponent;
+ u64 mantissa;
+};
+
+inline bool operator==(const FPUnpacked& a, const FPUnpacked& b) {
+ return std::tie(a.sign, a.exponent, a.mantissa) == std::tie(b.sign, b.exponent, b.mantissa);
+}
+
+/// return value = (sign ? -1 : +1) * value * 2^exponent
+constexpr FPUnpacked ToNormalized(bool sign, int exponent, u64 value) {
+ if (value == 0) {
+ return {sign, 0, 0};
+ }
+
+ const int highest_bit = mcl::bit::highest_set_bit(value);
+ const int offset = static_cast<int>(normalized_point_position) - highest_bit;
+ value <<= offset;
+ exponent -= offset - static_cast<int>(normalized_point_position);
+ return {sign, exponent, value};
+}
+
+template<typename FPT>
+std::tuple<FPType, bool, FPUnpacked> FPUnpackBase(FPT op, FPCR fpcr, FPSR& fpsr);
+
+template<typename FPT>
+std::tuple<FPType, bool, FPUnpacked> FPUnpack(FPT op, FPCR fpcr, FPSR& fpsr) {
+ fpcr.AHP(false);
+ return FPUnpackBase(op, fpcr, fpsr);
+}
+
+template<typename FPT>
+std::tuple<FPType, bool, FPUnpacked> FPUnpackCV(FPT op, FPCR fpcr, FPSR& fpsr) {
+ fpcr.FZ16(false);
+ return FPUnpackBase(op, fpcr, fpsr);
+}
+
+template<typename FPT>
+FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr);
+
+template<typename FPT>
+FPT FPRound(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
+ fpcr.AHP(false);
+ return FPRoundBase<FPT>(op, fpcr, rounding, fpsr);
+}
+
+template<typename FPT>
+FPT FPRoundCV(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) {
+ fpcr.FZ16(false);
+ return FPRoundBase<FPT>(op, fpcr, rounding, fpsr);
+}
+
+template<typename FPT>
+FPT FPRound(FPUnpacked op, FPCR fpcr, FPSR& fpsr) {
+ return FPRound<FPT>(op, fpcr, fpcr.RMode(), fpsr);
+}
+
+} // namespace Dynarmic::FP
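
A minimal sketch of the normalized representation described above: 6 = 1.5 * 2^2, so ToNormalized places 1.5 at the 2^62 fixed point and reports an exponent of 2 (assumes the dynarmic headers added in this patch are on the include path):

    #include "dynarmic/common/fp/unpacked.h"

    #include <cassert>

    int main() {
        using namespace Dynarmic::FP;
        // 6 = 1.5 * 2^2: the mantissa is 1.5 scaled so that bit 62 is the units bit.
        const FPUnpacked x = ToNormalized(false, 0, 6);
        assert(x.mantissa == 0x6000000000000000ULL);
        assert(x.exponent == 2);
        assert(!x.sign);
    }
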
diff --git a/externals/dynarmic/src/dynarmic/common/fp/util.h b/externals/dynarmic/src/dynarmic/common/fp/util.h
new file mode 100644
index 0000000000..fda34e3ee5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/fp/util.h
@@ -0,0 +1,99 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/common/fp/info.h"
+
+namespace Dynarmic::FP {
+
+/// Is floating point value a zero?
+template<typename FPT>
+inline bool IsZero(FPT value, FPCR fpcr) {
+ if (fpcr.FZ()) {
+ return (value & FPInfo<FPT>::exponent_mask) == 0;
+ }
+ return (value & ~FPInfo<FPT>::sign_mask) == 0;
+}
+
+/// Is floating point value an infinity?
+template<typename FPT>
+constexpr bool IsInf(FPT value) {
+ return (value & ~FPInfo<FPT>::sign_mask) == FPInfo<FPT>::Infinity(false);
+}
+
+/// Is floating point value a QNaN?
+template<typename FPT>
+constexpr bool IsQNaN(FPT value) {
+ constexpr FPT qnan_bits = FPInfo<FPT>::exponent_mask | FPInfo<FPT>::mantissa_msb;
+ return (value & qnan_bits) == qnan_bits;
+}
+
+/// Is floating point value a SNaN?
+template<typename FPT>
+constexpr bool IsSNaN(FPT value) {
+ constexpr FPT qnan_bits = FPInfo<FPT>::exponent_mask | FPInfo<FPT>::mantissa_msb;
+ constexpr FPT snan_bits = FPInfo<FPT>::exponent_mask;
+ return (value & qnan_bits) == snan_bits && (value & FPInfo<FPT>::mantissa_mask) != 0;
+}
+
+/// Is floating point value a NaN?
+template<typename FPT>
+constexpr bool IsNaN(FPT value) {
+ return IsQNaN(value) || IsSNaN(value);
+}
+
+/// Given a single argument, return the NaN value which would be returned by an ARM processor.
+/// If the argument isn't a NaN, returns std::nullopt.
+template<typename FPT>
+constexpr std::optional<FPT> ProcessNaNs(FPT a) {
+ if (IsSNaN(a)) {
+ return a | FPInfo<FPT>::mantissa_msb;
+ } else if (IsQNaN(a)) {
+ return a;
+ }
+ return std::nullopt;
+}
+
+/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor.
+/// If neither argument is a NaN, returns std::nullopt.
+template<typename FPT>
+constexpr std::optional<FPT> ProcessNaNs(FPT a, FPT b) {
+ if (IsSNaN(a)) {
+ return a | FPInfo<FPT>::mantissa_msb;
+ } else if (IsSNaN(b)) {
+ return b | FPInfo<FPT>::mantissa_msb;
+ } else if (IsQNaN(a)) {
+ return a;
+ } else if (IsQNaN(b)) {
+ return b;
+ }
+ return std::nullopt;
+}
+
+/// Given three arguments, return the NaN value which would be returned by an ARM processor.
+/// If none of the arguments is a NaN, returns std::nullopt.
+template<typename FPT>
+constexpr std::optional<FPT> ProcessNaNs(FPT a, FPT b, FPT c) {
+ if (IsSNaN(a)) {
+ return a | FPInfo<FPT>::mantissa_msb;
+ } else if (IsSNaN(b)) {
+ return b | FPInfo<FPT>::mantissa_msb;
+ } else if (IsSNaN(c)) {
+ return c | FPInfo<FPT>::mantissa_msb;
+ } else if (IsQNaN(a)) {
+ return a;
+ } else if (IsQNaN(b)) {
+ return b;
+ } else if (IsQNaN(c)) {
+ return c;
+ }
+ return std::nullopt;
+}
+
+} // namespace Dynarmic::FP
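
A small sketch of the bit-level helpers above for f32 (u32) values; the constants are the standard IEEE-754 single-precision NaN encodings:

    #include "dynarmic/common/fp/util.h"

    #include <mcl/stdint.hpp>

    #include <cassert>

    int main() {
        using namespace Dynarmic::FP;
        const u32 snan = 0x7F800001;  // signalling NaN: exponent all-ones, fraction MSB clear
        const u32 qnan = 0x7FC00000;  // quiet NaN: exponent all-ones, fraction MSB set

        assert(IsSNaN(snan) && IsQNaN(qnan) && IsNaN(snan));
        assert(!ProcessNaNs<u32>(0x3F800000));  // 1.0f is not a NaN

        // An SNaN in either operand wins and is quietened by setting the fraction MSB.
        assert(*ProcessNaNs<u32>(qnan, snan) == 0x7FC00001);
    }
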
diff --git a/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp
new file mode 100644
index 0000000000..636ee3a250
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp
@@ -0,0 +1,128 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <string>
+
+#include <fmt/format.h>
+
+#ifdef DYNARMIC_USE_LLVM
+# include <llvm-c/Disassembler.h>
+# include <llvm-c/Target.h>
+#endif
+
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/llvm_disassemble.h"
+
+namespace Dynarmic::Common {
+
+std::string DisassembleX64(const void* begin, const void* end) {
+ std::string result;
+
+#ifdef DYNARMIC_USE_LLVM
+ LLVMInitializeX86TargetInfo();
+ LLVMInitializeX86TargetMC();
+ LLVMInitializeX86Disassembler();
+ LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm("x86_64", nullptr, 0, nullptr, nullptr);
+ LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant);
+
+ const u8* pos = reinterpret_cast<const u8*>(begin);
+ size_t remaining = reinterpret_cast<size_t>(end) - reinterpret_cast<size_t>(pos);
+ while (pos < end) {
+ char buffer[80];
+ size_t inst_size = LLVMDisasmInstruction(llvm_ctx, const_cast<u8*>(pos), remaining, reinterpret_cast<u64>(pos), buffer, sizeof(buffer));
+ ASSERT(inst_size);
+ for (const u8* i = pos; i < pos + inst_size; i++)
+ result += fmt::format("{:02x} ", *i);
+ for (size_t i = inst_size; i < 10; i++)
+ result += " ";
+ result += buffer;
+ result += '\n';
+
+ pos += inst_size;
+ remaining -= inst_size;
+ }
+
+ LLVMDisasmDispose(llvm_ctx);
+#else
+ result += fmt::format("(recompile with DYNARMIC_USE_LLVM=ON to disassemble the generated x86_64 code)\n");
+ result += fmt::format("start: {:016x}, end: {:016x}\n", mcl::bit_cast<u64>(begin), mcl::bit_cast<u64>(end));
+#endif
+
+ return result;
+}
+
+std::string DisassembleAArch32([[maybe_unused]] bool is_thumb, [[maybe_unused]] u32 pc, [[maybe_unused]] const u8* instructions, [[maybe_unused]] size_t length) {
+ std::string result;
+
+#ifdef DYNARMIC_USE_LLVM
+ LLVMInitializeARMTargetInfo();
+ LLVMInitializeARMTargetMC();
+ LLVMInitializeARMDisassembler();
+ LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm(is_thumb ? "thumbv8-arm" : "armv8-arm", nullptr, 0, nullptr, nullptr);
+ LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant);
+
+ char buffer[1024];
+ while (length) {
+ size_t inst_size = LLVMDisasmInstruction(llvm_ctx, const_cast<u8*>(instructions), length, pc, buffer, sizeof(buffer));
+ const char* const disassembled = inst_size > 0 ? buffer : "<invalid instruction>";
+
+ if (inst_size == 0)
+ inst_size = is_thumb ? 2 : 4;
+
+ result += fmt::format("{:08x} ", pc);
+ for (size_t i = 0; i < 4; i++) {
+ if (i < inst_size) {
+ result += fmt::format("{:02x}", instructions[inst_size - i - 1]);
+ } else {
+ result += " ";
+ }
+ }
+ result += disassembled;
+ result += '\n';
+
+ if (length <= inst_size)
+ break;
+
+ pc += inst_size;
+ instructions += inst_size;
+ length -= inst_size;
+ }
+
+ LLVMDisasmDispose(llvm_ctx);
+#else
+ result += fmt::format("(disassembly disabled)\n");
+#endif
+
+ return result;
+}
+
+std::string DisassembleAArch64([[maybe_unused]] u32 instruction, [[maybe_unused]] u64 pc) {
+ std::string result;
+
+#ifdef DYNARMIC_USE_LLVM
+ LLVMInitializeAArch64TargetInfo();
+ LLVMInitializeAArch64TargetMC();
+ LLVMInitializeAArch64Disassembler();
+ LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm("aarch64", nullptr, 0, nullptr, nullptr);
+ LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant);
+
+ char buffer[80];
+ size_t inst_size = LLVMDisasmInstruction(llvm_ctx, (u8*)&instruction, sizeof(instruction), pc, buffer, sizeof(buffer));
+ result = fmt::format("{:016x} {:08x} ", pc, instruction);
+ result += inst_size > 0 ? buffer : "<invalid instruction>";
+ result += '\n';
+
+ LLVMDisasmDispose(llvm_ctx);
+#else
+ result += fmt::format("(disassembly disabled)\n");
+#endif
+
+ return result;
+}
+
+} // namespace Dynarmic::Common
diff --git a/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h
new file mode 100644
index 0000000000..56de791a56
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h
@@ -0,0 +1,18 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common {
+
+std::string DisassembleX64(const void* pos, const void* end);
+std::string DisassembleAArch32(bool is_thumb, u32 pc, const u8* instructions, size_t length);
+std::string DisassembleAArch64(u32 instruction, u64 pc = 0);
+
+} // namespace Dynarmic::Common
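
A minimal usage sketch; 0xD503201F is the AArch64 NOP encoding. Without DYNARMIC_USE_LLVM the function returns the "(disassembly disabled)" placeholder seen above:

    #include "dynarmic/common/llvm_disassemble.h"

    #include <fmt/format.h>

    int main() {
        // With LLVM available this prints something like "0000000000000000 d503201f  nop".
        fmt::print("{}", Dynarmic::Common::DisassembleAArch64(0xD503201F));
    }
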
diff --git a/externals/dynarmic/src/dynarmic/common/lut_from_list.h b/externals/dynarmic/src/dynarmic/common/lut_from_list.h
new file mode 100644
index 0000000000..ed9e3dc046
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/lut_from_list.h
@@ -0,0 +1,37 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <initializer_list>
+#include <map>
+#include <type_traits>
+
+#include <mcl/mp/metafunction/apply.hpp>
+#include <mcl/mp/typelist/list.hpp>
+#include <mcl/type_traits/is_instance_of_template.hpp>
+
+#ifdef _MSC_VER
+# include <mcl/mp/typelist/head.hpp>
+#endif
+
+namespace Dynarmic::Common {
+
+template<typename Function, typename... Values>
+inline auto GenerateLookupTableFromList(Function f, mcl::mp::list<Values...>) {
+#ifdef _MSC_VER
+ using PairT = std::invoke_result_t<Function, mcl::mp::head<mcl::mp::list<Values...>>>;
+#else
+ using PairT = std::common_type_t<std::invoke_result_t<Function, Values>...>;
+#endif
+ using MapT = mcl::mp::apply<std::map, PairT>;
+
+ static_assert(mcl::is_instance_of_template_v<std::pair, PairT>);
+
+ const std::initializer_list<PairT> pair_array{f(Values{})...};
+ return MapT(pair_array.begin(), pair_array.end());
+}
+
+} // namespace Dynarmic::Common
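
A minimal sketch of GenerateLookupTableFromList; Foo and Bar are hypothetical tag types introduced only for this illustration:

    #include "dynarmic/common/lut_from_list.h"

    #include <mcl/mp/typelist/list.hpp>

    #include <cassert>
    #include <utility>

    // Hypothetical tag types used purely for illustration.
    struct Foo { static constexpr int id = 1; };
    struct Bar { static constexpr int id = 2; };

    int main() {
        // Each type in the list is default-constructed and passed to the lambda; the
        // returned pairs are collected into a std::map keyed on the pair's first member.
        const auto lut = Dynarmic::Common::GenerateLookupTableFromList(
            [](auto tag) { return std::pair<int, int>{decltype(tag)::id, decltype(tag)::id * 10}; },
            mcl::mp::list<Foo, Bar>{});

        assert(lut.size() == 2);  // lut is a std::map<int, int>
        assert(lut.at(2) == 20);
    }
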
diff --git a/externals/dynarmic/src/dynarmic/common/math_util.cpp b/externals/dynarmic/src/dynarmic/common/math_util.cpp
new file mode 100644
index 0000000000..32211fd979
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/math_util.cpp
@@ -0,0 +1,68 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/math_util.h"
+
+#include <array>
+
+namespace Dynarmic::Common {
+
+u8 RecipEstimate(u64 a) {
+ using LUT = std::array<u8, 256>;
+ static constexpr u64 lut_offset = 256;
+
+ static const LUT lut = [] {
+ LUT result{};
+ for (u64 i = 0; i < result.size(); i++) {
+ u64 a = i + lut_offset;
+
+ a = a * 2 + 1;
+ u64 b = (1u << 19) / a;
+ result[i] = static_cast<u8>((b + 1) / 2);
+ }
+ return result;
+ }();
+
+ return lut[a - lut_offset];
+}
+
+/// Input is a u0.9 fixed point number. Only values in [0.25, 1.0) are valid.
+/// Output is a u0.8 fixed point number, with an implied 1 prefixed.
+/// i.e.: The output is a value in [1.0, 2.0).
+u8 RecipSqrtEstimate(u64 a) {
+ using LUT = std::array<u8, 512>;
+
+ static const LUT lut = [] {
+ LUT result{};
+ for (u64 i = 128; i < result.size(); i++) {
+ u64 a = i;
+
+ // Convert to u.10 (with 8 significant bits), force to odd
+ if (a < 256) {
+ // [0.25, 0.5)
+ a = a * 2 + 1;
+ } else {
+ // [0.5, 1.0)
+ a = (a | 1) * 2;
+ }
+
+            // Calculate the largest b for which b < 1.0 / sqrt(a).
+ // Start from b = 1.0 (in u.9) since b cannot be smaller.
+ u64 b = 512;
+ // u.10 * u.9 * u.9 -> u.28
+ while (a * (b + 1) * (b + 1) < (1u << 28)) {
+ b++;
+ }
+
+ // Round to nearest u0.8 (with implied set integer bit).
+ result[i] = static_cast<u8>((b + 1) / 2);
+ }
+ return result;
+ }();
+
+ return lut[a & 0x1FF];
+}
+
+} // namespace Dynarmic::Common
diff --git a/externals/dynarmic/src/dynarmic/common/math_util.h b/externals/dynarmic/src/dynarmic/common/math_util.h
new file mode 100644
index 0000000000..5c1f784c89
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/math_util.h
@@ -0,0 +1,47 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <utility>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common {
+
+/**
+ * This function is a workaround for a bug in MSVC 19.12 where fold expressions
+ * do not work when the /permissive- flag is enabled.
+ */
+template<typename T, typename... Ts>
+constexpr T Sum(T first, Ts&&... rest) {
+ if constexpr (sizeof...(rest) == 0) {
+ return first;
+ } else {
+ return first + Sum(std::forward<Ts>(rest)...);
+ }
+}
+
+/**
+ * Input is a u0.9 fixed point number. Only values in [0.5, 1.0) are valid.
+ * Output is a u0.8 fixed point number, with an implied 1 prefixed.
+ * i.e.: The output is a value in [1.0, 2.0).
+ *
+ * @see RecipEstimate() within the ARMv8 architecture reference manual
+ * for a general overview of the requirements of the algorithm.
+ */
+u8 RecipEstimate(u64 a);
+
+/**
+ * Input is a u0.9 fixed point number. Only values in [0.25, 1.0) are valid.
+ * Output is a u0.8 fixed point number, with an implied 1 prefixed.
+ * i.e.: The output is a value in [1.0, 2.0).
+ *
+ * @see RecipSqrtEstimate() within the ARMv8 architecture reference manual
+ * for a general overview of the requirements of the algorithm.
+ */
+u8 RecipSqrtEstimate(u64 a);
+
+} // namespace Dynarmic::Common
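
Two worked data points for RecipEstimate, following the table construction in math_util.cpp above (the returned byte is the estimate with its implied leading 1 stripped):

    #include "dynarmic/common/math_util.h"

    #include <cassert>

    int main() {
        using Dynarmic::Common::RecipEstimate;
        // Input 256/512 = 0.5: the estimate of 1/0.5 comes back as 255,
        // i.e. (256 + 255)/256 ~= 1.996 once the implied leading bit is restored.
        assert(RecipEstimate(256) == 255);
        // Input 511/512 ~= 0.998: the estimate is 0, i.e. exactly 1.0 with the implied bit.
        assert(RecipEstimate(511) == 0);
    }
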
diff --git a/externals/dynarmic/src/dynarmic/common/memory_pool.cpp b/externals/dynarmic/src/dynarmic/common/memory_pool.cpp
new file mode 100644
index 0000000000..a62cd11295
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/memory_pool.cpp
@@ -0,0 +1,44 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/memory_pool.h"
+
+#include <cstdlib>
+
+namespace Dynarmic::Common {
+
+Pool::Pool(size_t object_size, size_t initial_pool_size)
+ : object_size(object_size), slab_size(initial_pool_size) {
+ AllocateNewSlab();
+}
+
+Pool::~Pool() {
+ std::free(current_slab);
+
+ for (char* slab : slabs) {
+ std::free(slab);
+ }
+}
+
+void* Pool::Alloc() {
+ if (remaining == 0) {
+ slabs.push_back(current_slab);
+ AllocateNewSlab();
+ }
+
+ void* ret = static_cast<void*>(current_ptr);
+ current_ptr += object_size;
+ remaining--;
+
+ return ret;
+}
+
+void Pool::AllocateNewSlab() {
+ current_slab = static_cast<char*>(std::malloc(object_size * slab_size));
+ current_ptr = current_slab;
+ remaining = slab_size;
+}
+
+} // namespace Dynarmic::Common
diff --git a/externals/dynarmic/src/dynarmic/common/memory_pool.h b/externals/dynarmic/src/dynarmic/common/memory_pool.h
new file mode 100644
index 0000000000..1cbdd9095e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/memory_pool.h
@@ -0,0 +1,45 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+namespace Dynarmic::Common {
+
+class Pool {
+public:
+ /**
+ * @param object_size Byte-size of objects to construct
+ * @param initial_pool_size Number of objects to have per slab
+ */
+ Pool(size_t object_size, size_t initial_pool_size);
+ ~Pool();
+
+ Pool(const Pool&) = delete;
+ Pool(Pool&&) = delete;
+
+ Pool& operator=(const Pool&) = delete;
+ Pool& operator=(Pool&&) = delete;
+
+    /// Returns a pointer to an `object_size`-byte block of memory.
+ void* Alloc();
+
+private:
+ // Allocates a completely new memory slab.
+ // Used when an entirely new slab is needed
+    // due to the current one running out of usable space.
+ void AllocateNewSlab();
+
+ size_t object_size;
+ size_t slab_size;
+ char* current_slab;
+ char* current_ptr;
+ size_t remaining;
+ std::vector<char*> slabs;
+};
+
+} // namespace Dynarmic::Common
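
A brief usage sketch of Pool: allocation is a bump-pointer increment, individual objects are never freed, and every slab is released when the pool is destroyed:

    #include "dynarmic/common/memory_pool.h"

    #include <cstring>

    int main() {
        // 64-byte objects, 16 per slab; the 17th Alloc() transparently malloc()s a new slab.
        Dynarmic::Common::Pool pool(64, 16);
        for (int i = 0; i < 40; i++) {
            void* p = pool.Alloc();
            std::memset(p, 0, 64);
        }
    }   // ~Pool frees every slab at once
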
diff --git a/externals/dynarmic/src/dynarmic/common/safe_ops.h b/externals/dynarmic/src/dynarmic/common/safe_ops.h
new file mode 100644
index 0000000000..aef3134762
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/safe_ops.h
@@ -0,0 +1,115 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <type_traits>
+
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/u128.h"
+
+namespace Dynarmic::Safe {
+
+template<typename T>
+T LogicalShiftLeft(T value, int shift_amount);
+template<typename T>
+T LogicalShiftRight(T value, int shift_amount);
+template<typename T>
+T ArithmeticShiftLeft(T value, int shift_amount);
+template<typename T>
+T ArithmeticShiftRight(T value, int shift_amount);
+
+template<typename T>
+T LogicalShiftLeft(T value, int shift_amount) {
+ static_assert(std::is_integral_v<T>);
+
+ if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) {
+ return 0;
+ }
+
+ if (shift_amount < 0) {
+ return LogicalShiftRight(value, -shift_amount);
+ }
+
+ auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value);
+ return static_cast<T>(unsigned_value << shift_amount);
+}
+
+template<>
+inline u128 LogicalShiftLeft(u128 value, int shift_amount) {
+ return value << shift_amount;
+}
+
+template<typename T>
+T LogicalShiftRight(T value, int shift_amount) {
+ static_assert(std::is_integral_v<T>);
+
+ if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) {
+ return 0;
+ }
+
+ if (shift_amount < 0) {
+ return LogicalShiftLeft(value, -shift_amount);
+ }
+
+ auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value);
+ return static_cast<T>(unsigned_value >> shift_amount);
+}
+
+template<>
+inline u128 LogicalShiftRight(u128 value, int shift_amount) {
+ return value >> shift_amount;
+}
+
+template<typename T>
+T LogicalShiftRightDouble(T top, T bottom, int shift_amount) {
+ return LogicalShiftLeft(top, int(mcl::bitsizeof<T>) - shift_amount) | LogicalShiftRight(bottom, shift_amount);
+}
+
+template<typename T>
+T ArithmeticShiftLeft(T value, int shift_amount) {
+ static_assert(std::is_integral_v<T>);
+
+ if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) {
+ return 0;
+ }
+
+ if (shift_amount < 0) {
+ return ArithmeticShiftRight(value, -shift_amount);
+ }
+
+ auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value);
+ return static_cast<T>(unsigned_value << shift_amount);
+}
+
+template<typename T>
+T ArithmeticShiftRight(T value, int shift_amount) {
+ static_assert(std::is_integral_v<T>);
+
+ if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) {
+ return mcl::bit::most_significant_bit(value) ? ~static_cast<T>(0) : 0;
+ }
+
+ if (shift_amount < 0) {
+ return ArithmeticShiftLeft(value, -shift_amount);
+ }
+
+ auto signed_value = static_cast<std::make_signed_t<T>>(value);
+ return static_cast<T>(signed_value >> shift_amount);
+}
+
+template<typename T>
+T ArithmeticShiftRightDouble(T top, T bottom, int shift_amount) {
+ return ArithmeticShiftLeft(top, int(mcl::bitsizeof<T>) - shift_amount) | LogicalShiftRight(bottom, shift_amount);
+}
+
+template<typename T>
+T Negate(T value) {
+ return static_cast<T>(~static_cast<std::uintmax_t>(value) + 1);
+}
+
+} // namespace Dynarmic::Safe
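
A small sketch of the shift helpers above; unlike the built-in operators, out-of-range and negative shift amounts are well defined:

    #include "dynarmic/common/safe_ops.h"

    #include <cassert>
    #include <cstdint>

    int main() {
        using namespace Dynarmic::Safe;
        assert(LogicalShiftLeft<u32>(1, 40) == 0);                 // shift >= bit width yields 0
        assert(LogicalShiftLeft<u32>(0x80, -4) == 0x8);            // negative amounts shift the other way
        assert(ArithmeticShiftRight<std::int32_t>(-8, 2) == -2);   // sign-preserving right shift
        assert(Negate<u32>(1) == 0xFFFFFFFFu);                     // two's complement negation without UB
    }
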
diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock.h b/externals/dynarmic/src/dynarmic/common/spin_lock.h
new file mode 100644
index 0000000000..f653704db6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/spin_lock.h
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic {
+
+struct SpinLock {
+ void Lock();
+ void Unlock();
+
+ volatile int storage = 0;
+};
+
+} // namespace Dynarmic
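
Usage is intentionally minimal; the Lock()/Unlock() bodies are JIT-emitted on first use (see the spin_lock_arm64.cpp and spin_lock_x64.cpp implementations below):

    #include "dynarmic/common/spin_lock.h"

    int main() {
        Dynarmic::SpinLock lock;   // storage == 0 means "unlocked"
        lock.Lock();
        // ... critical section ...
        lock.Unlock();
    }
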
diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp
new file mode 100644
index 0000000000..ccf807e2d2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp
@@ -0,0 +1,86 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mutex>
+
+#include <oaknut/code_block.hpp>
+#include <oaknut/oaknut.hpp>
+
+#include "dynarmic/backend/arm64/abi.h"
+#include "dynarmic/common/spin_lock.h"
+
+namespace Dynarmic {
+
+using Backend::Arm64::Wscratch0;
+using Backend::Arm64::Wscratch1;
+using namespace oaknut::util;
+
+void EmitSpinLockLock(oaknut::CodeGenerator& code, oaknut::XReg ptr) {
+ oaknut::Label start, loop;
+
+ code.MOV(Wscratch1, 1);
+ code.SEVL();
+ code.l(start);
+ code.WFE();
+ code.l(loop);
+ code.LDAXR(Wscratch0, ptr);
+ code.CBNZ(Wscratch0, start);
+ code.STXR(Wscratch0, Wscratch1, ptr);
+ code.CBNZ(Wscratch0, loop);
+}
+
+void EmitSpinLockUnlock(oaknut::CodeGenerator& code, oaknut::XReg ptr) {
+ code.STLR(WZR, ptr);
+}
+
+namespace {
+
+struct SpinLockImpl {
+ SpinLockImpl();
+
+ void Initialize();
+
+ oaknut::CodeBlock mem;
+ oaknut::CodeGenerator code;
+
+ void (*lock)(volatile int*);
+ void (*unlock)(volatile int*);
+};
+
+std::once_flag flag;
+SpinLockImpl impl;
+
+SpinLockImpl::SpinLockImpl()
+ : mem{4096}
+ , code{mem.ptr(), mem.ptr()} {}
+
+void SpinLockImpl::Initialize() {
+ mem.unprotect();
+
+ lock = code.xptr<void (*)(volatile int*)>();
+ EmitSpinLockLock(code, X0);
+ code.RET();
+
+ unlock = code.xptr<void (*)(volatile int*)>();
+ EmitSpinLockUnlock(code, X0);
+ code.RET();
+
+ mem.protect();
+ mem.invalidate_all();
+}
+
+} // namespace
+
+void SpinLock::Lock() {
+ std::call_once(flag, &SpinLockImpl::Initialize, impl);
+ impl.lock(&storage);
+}
+
+void SpinLock::Unlock() {
+ std::call_once(flag, &SpinLockImpl::Initialize, impl);
+ impl.unlock(&storage);
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h
new file mode 100644
index 0000000000..c0a86bfed3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h
@@ -0,0 +1,15 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <oaknut/oaknut.hpp>
+
+namespace Dynarmic {
+
+void EmitSpinLockLock(oaknut::CodeGenerator& code, oaknut::XReg ptr);
+void EmitSpinLockUnlock(oaknut::CodeGenerator& code, oaknut::XReg ptr);
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
new file mode 100644
index 0000000000..fdea94f4be
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
@@ -0,0 +1,76 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mutex>
+
+#include <xbyak/xbyak.h>
+
+#include "dynarmic/backend/x64/abi.h"
+#include "dynarmic/backend/x64/hostloc.h"
+#include "dynarmic/common/spin_lock.h"
+
+namespace Dynarmic {
+
+void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
+ Xbyak::Label start, loop;
+
+ code.jmp(start);
+ code.L(loop);
+ code.pause();
+ code.L(start);
+ code.mov(tmp, 1);
+ code.lock();
+ code.xchg(code.dword[ptr], tmp);
+ code.test(tmp, tmp);
+ code.jnz(loop);
+}
+
+void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
+ code.xor_(tmp, tmp);
+ code.xchg(code.dword[ptr], tmp);
+ code.mfence();
+}
+
+namespace {
+
+struct SpinLockImpl {
+ void Initialize();
+
+ Xbyak::CodeGenerator code;
+
+ void (*lock)(volatile int*);
+ void (*unlock)(volatile int*);
+};
+
+std::once_flag flag;
+SpinLockImpl impl;
+
+void SpinLockImpl::Initialize() {
+ const Xbyak::Reg64 ABI_PARAM1 = Backend::X64::HostLocToReg64(Backend::X64::ABI_PARAM1);
+
+ code.align();
+ lock = code.getCurr<void (*)(volatile int*)>();
+ EmitSpinLockLock(code, ABI_PARAM1, code.eax);
+ code.ret();
+
+ code.align();
+ unlock = code.getCurr<void (*)(volatile int*)>();
+ EmitSpinLockUnlock(code, ABI_PARAM1, code.eax);
+ code.ret();
+}
+
+} // namespace
+
+void SpinLock::Lock() {
+ std::call_once(flag, &SpinLockImpl::Initialize, impl);
+ impl.lock(&storage);
+}
+
+void SpinLock::Unlock() {
+ std::call_once(flag, &SpinLockImpl::Initialize, impl);
+ impl.unlock(&storage);
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h
new file mode 100644
index 0000000000..df6a3d7407
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h
@@ -0,0 +1,15 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <xbyak/xbyak.h>
+
+namespace Dynarmic {
+
+void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
+void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/common/string_util.h b/externals/dynarmic/src/dynarmic/common/string_util.h
new file mode 100644
index 0000000000..900b3aadd2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/string_util.h
@@ -0,0 +1,15 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::Common {
+
+template<typename T>
+constexpr char SignToChar(T value) {
+ return value >= 0 ? '+' : '-';
+}
+
+} // namespace Dynarmic::Common
diff --git a/externals/dynarmic/src/dynarmic/common/u128.cpp b/externals/dynarmic/src/dynarmic/common/u128.cpp
new file mode 100644
index 0000000000..dd78ac97ef
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/u128.cpp
@@ -0,0 +1,142 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/u128.h"
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic {
+
+u128 Multiply64To128(u64 a, u64 b) {
+ const u32 a0 = static_cast<u32>(a);
+ const u32 b0 = static_cast<u32>(b);
+ const u32 a1 = static_cast<u32>(a >> 32);
+ const u32 b1 = static_cast<u32>(b >> 32);
+
+ // result = (c2 << 64) + (c1 << 32) + c0
+ // c2 = a1 * b1
+ // c1 = a1 * b0 + a0 * b1
+ // c0 = a0 * b0
+ // noting that these operations may overflow
+
+ const u64 c0 = static_cast<u64>(a0) * b0;
+ const u64 c1_0 = static_cast<u64>(a1) * b0;
+ const u64 c1_1 = static_cast<u64>(a0) * b1;
+ const u64 c2 = static_cast<u64>(a1) * b1;
+
+ const u64 c1 = c1_0 + c1_1;
+ const u64 c1_overflow = c1 < c1_0;
+
+ const u64 lower = c0 + (c1 << 32);
+ const u64 lower_overflow = lower < c0;
+
+ const u64 upper = lower_overflow + (c1 >> 32) + (c1_overflow << 32) + c2;
+
+ u128 result;
+ result.lower = lower;
+ result.upper = upper;
+ return result;
+}
+
+u128 operator<<(u128 operand, int amount) {
+ if (amount < 0) {
+ return operand >> -amount;
+ }
+
+ if (amount == 0) {
+ return operand;
+ }
+
+ if (amount < 64) {
+ u128 result;
+ result.lower = (operand.lower << amount);
+ result.upper = (operand.upper << amount) | (operand.lower >> (64 - amount));
+ return result;
+ }
+
+ if (amount < 128) {
+ u128 result;
+ result.upper = operand.lower << (amount - 64);
+ return result;
+ }
+
+ return {};
+}
+
+u128 operator>>(u128 operand, int amount) {
+ if (amount < 0) {
+ return operand << -amount;
+ }
+
+ if (amount == 0) {
+ return operand;
+ }
+
+ if (amount < 64) {
+ u128 result;
+ result.lower = (operand.lower >> amount) | (operand.upper << (64 - amount));
+ result.upper = (operand.upper >> amount);
+ return result;
+ }
+
+ if (amount < 128) {
+ u128 result;
+ result.lower = operand.upper >> (amount - 64);
+ return result;
+ }
+
+ return {};
+}
+
+u128 StickyLogicalShiftRight(u128 operand, int amount) {
+ if (amount < 0) {
+ return operand << -amount;
+ }
+
+ if (amount == 0) {
+ return operand;
+ }
+
+ if (amount < 64) {
+ u128 result;
+ result.lower = (operand.lower >> amount) | (operand.upper << (64 - amount));
+ result.upper = (operand.upper >> amount);
+ // Sticky bit
+ if ((operand.lower << (64 - amount)) != 0) {
+ result.lower |= 1;
+ }
+ return result;
+ }
+
+ if (amount == 64) {
+ u128 result;
+ result.lower = operand.upper;
+ // Sticky bit
+ if (operand.lower != 0) {
+ result.lower |= 1;
+ }
+ return result;
+ }
+
+ if (amount < 128) {
+ u128 result;
+ result.lower = operand.upper >> (amount - 64);
+ // Sticky bit
+ if (operand.lower != 0) {
+ result.lower |= 1;
+ }
+ if ((operand.upper << (128 - amount)) != 0) {
+ result.lower |= 1;
+ }
+ return result;
+ }
+
+ if (operand.lower != 0 || operand.upper != 0) {
+ return u128(1);
+ }
+ return {};
+}
+
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/common/u128.h b/externals/dynarmic/src/dynarmic/common/u128.h
new file mode 100644
index 0000000000..f6df1ae6cc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/u128.h
@@ -0,0 +1,99 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <tuple>
+#include <type_traits>
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic {
+
+struct u128 {
+ u128() = default;
+ u128(const u128&) = default;
+ u128(u128&&) = default;
+ u128& operator=(const u128&) = default;
+ u128& operator=(u128&&) = default;
+
+ u128(u64 lower_, u64 upper_)
+ : lower(lower_), upper(upper_) {}
+
+ template<typename T>
+ /* implicit */ u128(T value)
+ : lower(value), upper(0) {
+ static_assert(std::is_integral_v<T>);
+ static_assert(mcl::bitsizeof<T> <= mcl::bitsizeof<u64>);
+ }
+
+ u64 lower = 0;
+ u64 upper = 0;
+
+ template<size_t bit_position>
+ bool Bit() const {
+ static_assert(bit_position < 128);
+ if constexpr (bit_position < 64) {
+ return mcl::bit::get_bit<bit_position>(lower);
+ } else {
+ return mcl::bit::get_bit<bit_position - 64>(upper);
+ }
+ }
+};
+
+static_assert(mcl::bitsizeof<u128> == 128);
+static_assert(std::is_standard_layout_v<u128>);
+static_assert(std::is_trivially_copyable_v<u128>);
+
+u128 Multiply64To128(u64 a, u64 b);
+
+inline u128 operator+(u128 a, u128 b) {
+ u128 result;
+ result.lower = a.lower + b.lower;
+ result.upper = a.upper + b.upper + (a.lower > result.lower);
+ return result;
+}
+
+inline u128 operator-(u128 a, u128 b) {
+ u128 result;
+ result.lower = a.lower - b.lower;
+ result.upper = a.upper - b.upper - (a.lower < result.lower);
+ return result;
+}
+
+inline bool operator<(u128 a, u128 b) {
+ return std::tie(a.upper, a.lower) < std::tie(b.upper, b.lower);
+}
+
+inline bool operator>(u128 a, u128 b) {
+ return operator<(b, a);
+}
+
+inline bool operator<=(u128 a, u128 b) {
+ return !operator>(a, b);
+}
+
+inline bool operator>=(u128 a, u128 b) {
+ return !operator<(a, b);
+}
+
+inline bool operator==(u128 a, u128 b) {
+ return std::tie(a.upper, a.lower) == std::tie(b.upper, b.lower);
+}
+
+inline bool operator!=(u128 a, u128 b) {
+ return !operator==(a, b);
+}
+
+u128 operator<<(u128 operand, int amount);
+u128 operator>>(u128 operand, int amount);
+
+/// The LSB is a "sticky bit":
+/// if any set bit is shifted off, the LSB of the result is set.
+u128 StickyLogicalShiftRight(u128 operand, int amount);
+
+} // namespace Dynarmic
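
A short sketch of the sticky shift and the widening multiply, assuming the headers in this tree:

    #include "dynarmic/common/u128.h"

    #include <cassert>

    int main() {
        using namespace Dynarmic;

        // Plain >> discards the shifted-out bits; the sticky variant ORs them into the LSB.
        assert((u128(0x21) >> 4).lower == 0x2);
        assert(StickyLogicalShiftRight(u128(0x21), 4).lower == 0x3);

        // 2^32 * 2^32 == 2^64: the product lands entirely in the upper half.
        const u128 p = Multiply64To128(u64(1) << 32, u64(1) << 32);
        assert(p.upper == 1 && p.lower == 0);
    }
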
diff --git a/externals/dynarmic/src/dynarmic/common/variant_util.h b/externals/dynarmic/src/dynarmic/common/variant_util.h
new file mode 100644
index 0000000000..4dd7f67167
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/variant_util.h
@@ -0,0 +1,29 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <boost/variant.hpp>
+
+namespace Dynarmic::Common {
+namespace detail {
+
+template<typename ReturnT, typename Lambda>
+struct VariantVisitor : boost::static_visitor<ReturnT>
+ , Lambda {
+ VariantVisitor(Lambda&& lambda)
+ : Lambda(std::move(lambda)) {}
+
+ using Lambda::operator();
+};
+
+} // namespace detail
+
+template<typename ReturnT, typename Variant, typename Lambda>
+inline ReturnT VisitVariant(Variant&& variant, Lambda&& lambda) {
+ return boost::apply_visitor(detail::VariantVisitor<ReturnT, Lambda>(std::move(lambda)), variant);
+}
+
+} // namespace Dynarmic::Common
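
A minimal sketch of VisitVariant with a generic lambda; the variant alternatives here are purely illustrative:

    #include "dynarmic/common/variant_util.h"

    #include <cassert>
    #include <string>
    #include <type_traits>

    int main() {
        boost::variant<int, std::string> v = 42;  // illustrative alternatives only

        // The generic lambda supplies the operator() overloads; VisitVariant wraps it in a
        // boost::static_visitor so apply_visitor knows the return type.
        const std::string s = Dynarmic::Common::VisitVariant<std::string>(v, [](const auto& x) -> std::string {
            if constexpr (std::is_same_v<std::decay_t<decltype(x)>, int>) {
                return "int: " + std::to_string(x);
            } else {
                return "string: " + x;
            }
        });

        assert(s == "int: 42");
    }
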
diff --git a/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp b/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp
new file mode 100644
index 0000000000..854de23a77
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp
@@ -0,0 +1,57 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/x64_disassemble.h"
+
+#include <Zydis/Zydis.h>
+#include <fmt/printf.h>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common {
+
+void DumpDisassembledX64(const void* ptr, size_t size) {
+ ZydisDecoder decoder;
+ ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
+
+ ZydisFormatter formatter;
+ ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL);
+
+ size_t offset = 0;
+ ZydisDecodedInstruction instruction;
+ ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
+ while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction, operands))) {
+ fmt::print("{:016x} ", (u64)ptr + offset);
+
+ char buffer[256];
+ ZydisFormatterFormatInstruction(&formatter, &instruction, operands, instruction.operand_count_visible, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset, ZYAN_NULL);
+ puts(buffer);
+
+ offset += instruction.length;
+ }
+}
+
+std::vector<std::string> DisassembleX64(const void* ptr, size_t size) {
+ std::vector<std::string> result;
+ ZydisDecoder decoder;
+ ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
+
+ ZydisFormatter formatter;
+ ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL);
+
+ size_t offset = 0;
+ ZydisDecodedInstruction instruction;
+ ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
+ while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction, operands))) {
+ char buffer[256];
+ ZydisFormatterFormatInstruction(&formatter, &instruction, operands, instruction.operand_count_visible, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset, ZYAN_NULL);
+
+ result.push_back(fmt::format("{:016x} {}", (u64)ptr + offset, buffer));
+
+ offset += instruction.length;
+ }
+
+ return result;
+}
+} // namespace Dynarmic::Common
diff --git a/externals/dynarmic/src/dynarmic/common/x64_disassemble.h b/externals/dynarmic/src/dynarmic/common/x64_disassemble.h
new file mode 100644
index 0000000000..03c511bfda
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.h
@@ -0,0 +1,21 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::Common {
+
+void DumpDisassembledX64(const void* ptr, size_t size);
+/**
+ * Disassemble `size' bytes from `ptr' and return the disassembled lines as a vector
+ * of strings.
+ */
+std::vector<std::string> DisassembleX64(const void* ptr, size_t size);
+} // namespace Dynarmic::Common
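
A minimal usage sketch; 0xC3 is the one-byte x86-64 `ret' instruction:

    #include "dynarmic/common/x64_disassemble.h"

    #include <fmt/format.h>

    int main() {
        static const unsigned char code[] = {0xC3};  // ret
        for (const std::string& line : Dynarmic::Common::DisassembleX64(code, sizeof(code))) {
            fmt::print("{}\n", line);  // address followed by the mnemonic, e.g. "... ret"
        }
    }
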
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h b/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h
new file mode 100644
index 0000000000..28414e9fa1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h
@@ -0,0 +1,191 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/rounding_mode.h"
+
+namespace Dynarmic::A32 {
+
+/**
+ * Representation of the Floating-Point Status and Control Register.
+ */
+class FPSCR final {
+public:
+ FPSCR() = default;
+ FPSCR(const FPSCR&) = default;
+ FPSCR(FPSCR&&) = default;
+ explicit FPSCR(u32 data)
+ : value{data & mask} {}
+
+ FPSCR& operator=(const FPSCR&) = default;
+ FPSCR& operator=(FPSCR&&) = default;
+ FPSCR& operator=(u32 data) {
+ value = data & mask;
+ return *this;
+ }
+
+ /// Negative condition flag.
+ bool N() const {
+ return mcl::bit::get_bit<31>(value);
+ }
+
+ /// Zero condition flag.
+ bool Z() const {
+ return mcl::bit::get_bit<30>(value);
+ }
+
+ /// Carry condition flag.
+ bool C() const {
+ return mcl::bit::get_bit<29>(value);
+ }
+
+ /// Overflow condition flag.
+ bool V() const {
+ return mcl::bit::get_bit<28>(value);
+ }
+
+ /// Cumulative saturation flag.
+ bool QC() const {
+ return mcl::bit::get_bit<27>(value);
+ }
+
+ /// Alternate half-precision control flag.
+ bool AHP() const {
+ return mcl::bit::get_bit<26>(value);
+ }
+
+ /// Default NaN mode control bit.
+ bool DN() const {
+ return mcl::bit::get_bit<25>(value);
+ }
+
+ /// Flush-to-zero mode control bit.
+ bool FTZ() const {
+ return mcl::bit::get_bit<24>(value);
+ }
+
+ /// Rounding mode control field.
+ FP::RoundingMode RMode() const {
+ return static_cast<FP::RoundingMode>(mcl::bit::get_bits<22, 23>(value));
+ }
+
+ /// Indicates the stride of a vector.
+ std::optional<size_t> Stride() const {
+ switch (mcl::bit::get_bits<20, 21>(value)) {
+ case 0b00:
+ return 1;
+ case 0b11:
+ return 2;
+ default:
+ return std::nullopt;
+ }
+ }
+
+ /// Indicates the length of a vector.
+ size_t Len() const {
+ return mcl::bit::get_bits<16, 18>(value) + 1;
+ }
+
+ /// Input denormal exception trap enable flag.
+ bool IDE() const {
+ return mcl::bit::get_bit<15>(value);
+ }
+
+ /// Inexact exception trap enable flag.
+ bool IXE() const {
+ return mcl::bit::get_bit<12>(value);
+ }
+
+ /// Underflow exception trap enable flag.
+ bool UFE() const {
+ return mcl::bit::get_bit<11>(value);
+ }
+
+ /// Overflow exception trap enable flag.
+ bool OFE() const {
+ return mcl::bit::get_bit<10>(value);
+ }
+
+ /// Division by zero exception trap enable flag.
+ bool DZE() const {
+ return mcl::bit::get_bit<9>(value);
+ }
+
+ /// Invalid operation exception trap enable flag.
+ bool IOE() const {
+ return mcl::bit::get_bit<8>(value);
+ }
+
+ /// Input denormal cumulative exception bit.
+ bool IDC() const {
+ return mcl::bit::get_bit<7>(value);
+ }
+
+ /// Inexact cumulative exception bit.
+ bool IXC() const {
+ return mcl::bit::get_bit<4>(value);
+ }
+
+ /// Underflow cumulative exception bit.
+ bool UFC() const {
+ return mcl::bit::get_bit<3>(value);
+ }
+
+ /// Overflow cumulative exception bit.
+ bool OFC() const {
+ return mcl::bit::get_bit<2>(value);
+ }
+
+ /// Division by zero cumulative exception bit.
+ bool DZC() const {
+ return mcl::bit::get_bit<1>(value);
+ }
+
+ /// Invalid operation cumulative exception bit.
+ bool IOC() const {
+ return mcl::bit::get_bit<0>(value);
+ }
+
+ /**
+ * Whether or not the FPSCR indicates RunFast mode.
+ *
+ * RunFast mode is enabled when:
+ * - Flush-to-zero is enabled
+ * - Default NaNs are enabled.
+ * - All exception enable bits are cleared.
+ */
+ bool InRunFastMode() const {
+ constexpr u32 runfast_mask = 0x03001F00;
+ constexpr u32 expected = 0x03000000;
+
+ return (value & runfast_mask) == expected;
+ }
+
+ /// Gets the underlying raw value within the FPSCR.
+ u32 Value() const {
+ return value;
+ }
+
+private:
+ // Bits 5-6, 13-14, and 19 are reserved.
+ static constexpr u32 mask = 0xFFF79F9F;
+ u32 value = 0;
+};
+
+inline bool operator==(FPSCR lhs, FPSCR rhs) {
+ return lhs.Value() == rhs.Value();
+}
+
+inline bool operator!=(FPSCR lhs, FPSCR rhs) {
+ return !operator==(lhs, rhs);
+}
+
+} // namespace Dynarmic::A32
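
A small sketch of the accessors above, using raw bit patterns taken from the field positions documented in this class:

    #include "dynarmic/frontend/A32/FPSCR.h"

    #include <cassert>

    int main() {
        using namespace Dynarmic::A32;

        // FZ (bit 24) and DN (bit 25) set with every trap-enable bit clear is RunFast mode.
        const FPSCR fpscr{0x03000000};
        assert(fpscr.FTZ() && fpscr.DN() && fpscr.InRunFastMode());

        // Bits 22-23 hold the rounding mode; 0b01 selects round towards plus infinity.
        assert(FPSCR{0x00400000}.RMode() == Dynarmic::FP::RoundingMode::TowardsPlusInfinity);
    }
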
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h b/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h
new file mode 100644
index 0000000000..ae69fa1e3f
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h
@@ -0,0 +1,64 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/cond.h"
+
+namespace Dynarmic::A32 {
+
+class ITState final {
+public:
+ ITState() = default;
+ explicit ITState(u8 data)
+ : value(data) {}
+
+ ITState& operator=(u8 data) {
+ value = data;
+ return *this;
+ }
+
+ IR::Cond Cond() const {
+ if (value == 0b00000000) {
+ return IR::Cond::AL;
+ }
+ return static_cast<IR::Cond>(mcl::bit::get_bits<4, 7>(value));
+ }
+
+ bool IsInITBlock() const {
+ return mcl::bit::get_bits<0, 3>(value) != 0b0000;
+ }
+
+ bool IsLastInITBlock() const {
+ return mcl::bit::get_bits<0, 3>(value) == 0b1000;
+ }
+
+ ITState Advance() const {
+ if (mcl::bit::get_bits<0, 2>(value) == 0b000) {
+ return ITState{0b00000000};
+ }
+ return ITState{mcl::bit::set_bits<0, 4>(value, static_cast<u8>(mcl::bit::get_bits<0, 4>(value) << 1))};
+ }
+
+ u8 Value() const {
+ return value;
+ }
+
+private:
+ u8 value = 0;
+};
+
+inline bool operator==(ITState lhs, ITState rhs) {
+ return lhs.Value() == rhs.Value();
+}
+
+inline bool operator!=(ITState lhs, ITState rhs) {
+ return !operator==(lhs, rhs);
+}
+
+} // namespace Dynarmic::A32
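
A short sketch of how the IT state advances; the initial value encodes condition EQ (0b0000) in bits 4-7 with two instructions remaining in the block (mask 0b1100 in bits 0-3):

    #include "dynarmic/frontend/A32/ITState.h"

    #include <cassert>

    int main() {
        using namespace Dynarmic::A32;

        ITState it{0b00001100};
        assert(it.IsInITBlock() && !it.IsLastInITBlock());

        it = it.Advance();   // mask becomes 0b1000
        assert(it.IsLastInITBlock());

        it = it.Advance();   // block exhausted, state resets to zero
        assert(!it.IsInITBlock());
        assert(it.Cond() == Dynarmic::IR::Cond::AL);
    }
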
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h b/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h
new file mode 100644
index 0000000000..9af78eaae5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h
@@ -0,0 +1,224 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/ITState.h"
+
+namespace Dynarmic::A32 {
+
+/**
+ * Program Status Register
+ *
+ * | Bit(s) | Description |
+ * |:-------:|:----------------------------------------------|
+ * | N | Negative |
+ * | Z | Zero |
+ * | C | Carry |
+ * | V | Overflow |
+ * | Q | Sticky overflow for DSP-oriented instructions |
+ * | IT[1:0] | Lower two bits of the If-Then execution state |
+ * | J | Jazelle bit |
+ * | GE | Greater-than or Equal |
+ * | IT[7:2] | Upper six bits of the If-Then execution state |
+ * | E | Endian (0 is little endian, 1 is big endian) |
+ * | A | Imprecise data abort (disables them when set) |
+ * | I | IRQ interrupts (disabled when set) |
+ * | F | FIQ interrupts (disabled when set) |
+ * | T | Thumb bit |
+ * | M | Current processor mode |
+ */
+class PSR final {
+public:
+ /// Valid processor modes that may be indicated.
+ enum class Mode : u32 {
+ User = 0b10000,
+ FIQ = 0b10001,
+ IRQ = 0b10010,
+ Supervisor = 0b10011,
+ Monitor = 0b10110,
+ Abort = 0b10111,
+ Hypervisor = 0b11010,
+ Undefined = 0b11011,
+ System = 0b11111
+ };
+
+ /// Instruction sets that may be signified through a PSR.
+ enum class InstructionSet {
+ ARM,
+ Jazelle,
+ Thumb,
+ ThumbEE
+ };
+
+ PSR() = default;
+ explicit PSR(u32 data)
+ : value{data & mask} {}
+
+ PSR& operator=(u32 data) {
+ value = data & mask;
+ return *this;
+ }
+
+ bool N() const {
+ return mcl::bit::get_bit<31>(value);
+ }
+ void N(bool set) {
+ value = mcl::bit::set_bit<31>(value, set);
+ }
+
+ bool Z() const {
+ return mcl::bit::get_bit<30>(value);
+ }
+ void Z(bool set) {
+ value = mcl::bit::set_bit<30>(value, set);
+ }
+
+ bool C() const {
+ return mcl::bit::get_bit<29>(value);
+ }
+ void C(bool set) {
+ value = mcl::bit::set_bit<29>(value, set);
+ }
+
+ bool V() const {
+ return mcl::bit::get_bit<28>(value);
+ }
+ void V(bool set) {
+ value = mcl::bit::set_bit<28>(value, set);
+ }
+
+ bool Q() const {
+ return mcl::bit::get_bit<27>(value);
+ }
+ void Q(bool set) {
+ value = mcl::bit::set_bit<27>(value, set);
+ }
+
+ bool J() const {
+ return mcl::bit::get_bit<24>(value);
+ }
+ void J(bool set) {
+ value = mcl::bit::set_bit<24>(value, set);
+ }
+
+ u32 GE() const {
+ return mcl::bit::get_bits<16, 19>(value);
+ }
+ void GE(u32 data) {
+ value = mcl::bit::set_bits<16, 19>(value, data);
+ }
+
+ ITState IT() const {
+ return ITState{static_cast<u8>((value & 0x6000000) >> 25 | (value & 0xFC00) >> 8)};
+ }
+ void IT(ITState it_state) {
+ const u32 data = it_state.Value();
+ value = (value & ~0x000FC00) | (data & 0b11111100) << 8;
+ value = (value & ~0x6000000) | (data & 0b00000011) << 25;
+ }
+
+ bool E() const {
+ return mcl::bit::get_bit<9>(value);
+ }
+ void E(bool set) {
+ value = mcl::bit::set_bit<9>(value, set);
+ }
+
+ bool A() const {
+ return mcl::bit::get_bit<8>(value);
+ }
+ void A(bool set) {
+ value = mcl::bit::set_bit<8>(value, set);
+ }
+
+ bool I() const {
+ return mcl::bit::get_bit<7>(value);
+ }
+ void I(bool set) {
+ value = mcl::bit::set_bit<7>(value, set);
+ }
+
+ bool F() const {
+ return mcl::bit::get_bit<6>(value);
+ }
+ void F(bool set) {
+ value = mcl::bit::set_bit<6>(value, set);
+ }
+
+ bool T() const {
+ return mcl::bit::get_bit<5>(value);
+ }
+ void T(bool set) {
+ value = mcl::bit::set_bit<5>(value, set);
+ }
+
+ Mode M() const {
+ return static_cast<Mode>(mcl::bit::get_bits<0, 4>(value));
+ }
+ void M(Mode mode) {
+ value = mcl::bit::set_bits<0, 4>(value, static_cast<u32>(mode));
+ }
+
+ u32 Value() const {
+ return value;
+ }
+
+ InstructionSet CurrentInstructionSet() const {
+ const bool j_bit = J();
+ const bool t_bit = T();
+
+ if (j_bit && t_bit)
+ return InstructionSet::ThumbEE;
+
+ if (t_bit)
+ return InstructionSet::Thumb;
+
+ if (j_bit)
+ return InstructionSet::Jazelle;
+
+ return InstructionSet::ARM;
+ }
+
+ void CurrentInstructionSet(InstructionSet instruction_set) {
+ switch (instruction_set) {
+ case InstructionSet::ARM:
+ T(false);
+ J(false);
+ break;
+ case InstructionSet::Jazelle:
+ T(false);
+ J(true);
+ break;
+ case InstructionSet::Thumb:
+ T(true);
+ J(false);
+ break;
+ case InstructionSet::ThumbEE:
+ T(true);
+ J(true);
+ break;
+ }
+ }
+
+private:
+ // Bits 20-23 are reserved and should be zero.
+ static constexpr u32 mask = 0xFF0FFFFF;
+
+ u32 value = 0;
+};
+
+inline bool operator==(PSR lhs, PSR rhs) {
+ return lhs.Value() == rhs.Value();
+}
+
+inline bool operator!=(PSR lhs, PSR rhs) {
+ return !operator==(lhs, rhs);
+}
+
+} // namespace Dynarmic::A32
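
A brief usage sketch (not part of the diff): the setters above pack each field into the bit position listed in the class comment, and CurrentInstructionSet() is simply a view over the J and T bits.

#include <cassert>
#include "dynarmic/frontend/A32/PSR.h"

int main() {
    using Dynarmic::A32::PSR;

    PSR psr;
    psr.M(PSR::Mode::User);                                      // mode bits [4:0]
    psr.CurrentInstructionSet(PSR::InstructionSet::Thumb);       // sets T, clears J
    psr.N(true);                                                 // negative flag, bit 31

    assert(psr.T() && !psr.J());
    assert(psr.CurrentInstructionSet() == PSR::InstructionSet::Thumb);
    assert((psr.Value() >> 31) == 1u);                           // N landed in bit 31
    assert(psr.M() == PSR::Mode::User);
}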
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp
new file mode 100644
index 0000000000..343e521aba
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp
@@ -0,0 +1,442 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/a32_ir_emitter.h"
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/interface/A32/arch_version.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::A32 {
+
+using Opcode = IR::Opcode;
+
+size_t IREmitter::ArchVersion() const {
+ switch (arch_version) {
+ case ArchVersion::v3:
+ return 3;
+ case ArchVersion::v4:
+ case ArchVersion::v4T:
+ return 4;
+ case ArchVersion::v5TE:
+ return 5;
+ case ArchVersion::v6K:
+ case ArchVersion::v6T2:
+ return 6;
+ case ArchVersion::v7:
+ return 7;
+ case ArchVersion::v8:
+ return 8;
+ }
+ UNREACHABLE();
+}
+
+u32 IREmitter::PC() const {
+ const u32 offset = current_location.TFlag() ? 4 : 8;
+ return current_location.PC() + offset;
+}
+
+u32 IREmitter::AlignPC(size_t alignment) const {
+ const u32 pc = PC();
+ return static_cast<u32>(pc - pc % alignment);
+}
+
+IR::U32 IREmitter::GetRegister(Reg reg) {
+ if (reg == A32::Reg::PC) {
+ return Imm32(PC());
+ }
+ return Inst<IR::U32>(Opcode::A32GetRegister, IR::Value(reg));
+}
+
+IR::U32U64 IREmitter::GetExtendedRegister(ExtReg reg) {
+ if (A32::IsSingleExtReg(reg)) {
+ return Inst<IR::U32U64>(Opcode::A32GetExtendedRegister32, IR::Value(reg));
+ }
+
+ if (A32::IsDoubleExtReg(reg)) {
+ return Inst<IR::U32U64>(Opcode::A32GetExtendedRegister64, IR::Value(reg));
+ }
+
+ ASSERT_FALSE("Invalid reg.");
+}
+
+IR::U128 IREmitter::GetVector(ExtReg reg) {
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+ return Inst<IR::U128>(Opcode::A32GetVector, IR::Value(reg));
+}
+
+void IREmitter::SetRegister(const Reg reg, const IR::U32& value) {
+ ASSERT(reg != A32::Reg::PC);
+ Inst(Opcode::A32SetRegister, IR::Value(reg), value);
+}
+
+void IREmitter::SetExtendedRegister(const ExtReg reg, const IR::U32U64& value) {
+ if (A32::IsSingleExtReg(reg)) {
+ Inst(Opcode::A32SetExtendedRegister32, IR::Value(reg), value);
+ } else if (A32::IsDoubleExtReg(reg)) {
+ Inst(Opcode::A32SetExtendedRegister64, IR::Value(reg), value);
+ } else {
+ ASSERT_FALSE("Invalid reg.");
+ }
+}
+
+void IREmitter::SetVector(ExtReg reg, const IR::U128& value) {
+ ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg));
+ Inst(Opcode::A32SetVector, IR::Value(reg), value);
+}
+
+void IREmitter::ALUWritePC(const IR::U32& value) {
+ // This behaviour is ARM version-dependent.
+ if (ArchVersion() >= 7 && !current_location.TFlag()) {
+ BXWritePC(value);
+ } else {
+ BranchWritePC(value);
+ }
+}
+
+void IREmitter::BranchWritePC(const IR::U32& value) {
+ if (!current_location.TFlag()) {
+ // Note that for ArchVersion() < 6, this is UNPREDICTABLE when value<1:0> != 0b00
+ const auto new_pc = And(value, Imm32(0xFFFFFFFC));
+ Inst(Opcode::A32SetRegister, IR::Value(A32::Reg::PC), new_pc);
+ } else {
+ const auto new_pc = And(value, Imm32(0xFFFFFFFE));
+ Inst(Opcode::A32SetRegister, IR::Value(A32::Reg::PC), new_pc);
+ }
+}
+
+void IREmitter::BXWritePC(const IR::U32& value) {
+ Inst(Opcode::A32BXWritePC, value);
+}
+
+void IREmitter::LoadWritePC(const IR::U32& value) {
+ // This behaviour is ARM version-dependent.
+ if (ArchVersion() >= 5) {
+ BXWritePC(value);
+ } else {
+ BranchWritePC(value);
+ }
+}
+
+void IREmitter::UpdateUpperLocationDescriptor() {
+ Inst(Opcode::A32UpdateUpperLocationDescriptor);
+}
+
+void IREmitter::CallSupervisor(const IR::U32& value) {
+ Inst(Opcode::A32CallSupervisor, value);
+}
+
+void IREmitter::ExceptionRaised(const Exception exception) {
+ Inst(Opcode::A32ExceptionRaised, Imm32(current_location.PC()), Imm64(static_cast<u64>(exception)));
+}
+
+IR::U32 IREmitter::GetCpsr() {
+ return Inst<IR::U32>(Opcode::A32GetCpsr);
+}
+
+void IREmitter::SetCpsr(const IR::U32& value) {
+ Inst(Opcode::A32SetCpsr, value);
+}
+
+void IREmitter::SetCpsrNZCV(const IR::NZCV& value) {
+ Inst(Opcode::A32SetCpsrNZCV, value);
+}
+
+void IREmitter::SetCpsrNZCVRaw(const IR::U32& value) {
+ Inst(Opcode::A32SetCpsrNZCVRaw, value);
+}
+
+void IREmitter::SetCpsrNZCVQ(const IR::U32& value) {
+ Inst(Opcode::A32SetCpsrNZCVQ, value);
+}
+
+void IREmitter::SetCheckBit(const IR::U1& value) {
+ Inst(Opcode::A32SetCheckBit, value);
+}
+
+IR::U1 IREmitter::GetOverflowFrom(const IR::Value& value) {
+ return Inst<IR::U1>(Opcode::GetOverflowFromOp, value);
+}
+
+IR::U1 IREmitter::GetCFlag() {
+ return Inst<IR::U1>(Opcode::A32GetCFlag);
+}
+
+void IREmitter::OrQFlag(const IR::U1& value) {
+ Inst(Opcode::A32OrQFlag, value);
+}
+
+IR::U32 IREmitter::GetGEFlags() {
+ return Inst<IR::U32>(Opcode::A32GetGEFlags);
+}
+
+void IREmitter::SetGEFlags(const IR::U32& value) {
+ Inst(Opcode::A32SetGEFlags, value);
+}
+
+void IREmitter::SetGEFlagsCompressed(const IR::U32& value) {
+ Inst(Opcode::A32SetGEFlagsCompressed, value);
+}
+
+IR::NZCV IREmitter::NZFrom(const IR::Value& value) {
+ return Inst<IR::NZCV>(Opcode::GetNZFromOp, value);
+}
+
+void IREmitter::SetCpsrNZ(const IR::NZCV& nz) {
+ Inst(Opcode::A32SetCpsrNZ, nz);
+}
+
+void IREmitter::SetCpsrNZC(const IR::NZCV& nz, const IR::U1& c) {
+ Inst(Opcode::A32SetCpsrNZC, nz, c);
+}
+
+void IREmitter::DataSynchronizationBarrier() {
+ Inst(Opcode::A32DataSynchronizationBarrier);
+}
+
+void IREmitter::DataMemoryBarrier() {
+ Inst(Opcode::A32DataMemoryBarrier);
+}
+
+void IREmitter::InstructionSynchronizationBarrier() {
+ Inst(Opcode::A32InstructionSynchronizationBarrier);
+}
+
+IR::U32 IREmitter::GetFpscr() {
+ return Inst<IR::U32>(Opcode::A32GetFpscr);
+}
+
+void IREmitter::SetFpscr(const IR::U32& new_fpscr) {
+ Inst(Opcode::A32SetFpscr, new_fpscr);
+}
+
+IR::U32 IREmitter::GetFpscrNZCV() {
+ return Inst<IR::U32>(Opcode::A32GetFpscrNZCV);
+}
+
+void IREmitter::SetFpscrNZCV(const IR::NZCV& new_fpscr_nzcv) {
+ Inst(Opcode::A32SetFpscrNZCV, new_fpscr_nzcv);
+}
+
+void IREmitter::ClearExclusive() {
+ Inst(Opcode::A32ClearExclusive);
+}
+
+IR::UAny IREmitter::ReadMemory(size_t bitsize, const IR::U32& vaddr, IR::AccType acc_type) {
+ switch (bitsize) {
+ case 8:
+ return ReadMemory8(vaddr, acc_type);
+ case 16:
+ return ReadMemory16(vaddr, acc_type);
+ case 32:
+ return ReadMemory32(vaddr, acc_type);
+ case 64:
+ return ReadMemory64(vaddr, acc_type);
+ }
+ ASSERT_FALSE("Invalid bitsize");
+}
+
+IR::U8 IREmitter::ReadMemory8(const IR::U32& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U8>(Opcode::A32ReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U16 IREmitter::ReadMemory16(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U16>(Opcode::A32ReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ return current_location.EFlag() ? ByteReverseHalf(value) : value;
+}
+
+IR::U32 IREmitter::ReadMemory32(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U32>(Opcode::A32ReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ return current_location.EFlag() ? ByteReverseWord(value) : value;
+}
+
+IR::U64 IREmitter::ReadMemory64(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U64>(Opcode::A32ReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ return current_location.EFlag() ? ByteReverseDual(value) : value;
+}
+
+IR::U8 IREmitter::ExclusiveReadMemory8(const IR::U32& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U8>(Opcode::A32ExclusiveReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U16 IREmitter::ExclusiveReadMemory16(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U16>(Opcode::A32ExclusiveReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ return current_location.EFlag() ? ByteReverseHalf(value) : value;
+}
+
+IR::U32 IREmitter::ExclusiveReadMemory32(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U32>(Opcode::A32ExclusiveReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ return current_location.EFlag() ? ByteReverseWord(value) : value;
+}
+
+std::pair<IR::U32, IR::U32> IREmitter::ExclusiveReadMemory64(const IR::U32& vaddr, IR::AccType acc_type) {
+ const auto value = Inst<IR::U64>(Opcode::A32ExclusiveReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+ const auto lo = LeastSignificantWord(value);
+ const auto hi = MostSignificantWord(value).result;
+ if (current_location.EFlag()) {
+ // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR
+ return std::make_pair(ByteReverseWord(lo), ByteReverseWord(hi));
+ }
+ return std::make_pair(lo, hi);
+}
+
+void IREmitter::WriteMemory(size_t bitsize, const IR::U32& vaddr, const IR::UAny& value, IR::AccType acc_type) {
+ switch (bitsize) {
+ case 8:
+ return WriteMemory8(vaddr, value, acc_type);
+ case 16:
+ return WriteMemory16(vaddr, value, acc_type);
+ case 32:
+ return WriteMemory32(vaddr, value, acc_type);
+ case 64:
+ return WriteMemory64(vaddr, value, acc_type);
+ }
+ ASSERT_FALSE("Invalid bitsize");
+}
+
+void IREmitter::WriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type) {
+ Inst(Opcode::A32WriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto v = ByteReverseHalf(value);
+ Inst(Opcode::A32WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type});
+ } else {
+ Inst(Opcode::A32WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+ }
+}
+
+void IREmitter::WriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto v = ByteReverseWord(value);
+ Inst(Opcode::A32WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type});
+ } else {
+ Inst(Opcode::A32WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+ }
+}
+
+void IREmitter::WriteMemory64(const IR::U32& vaddr, const IR::U64& value, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto v = ByteReverseDual(value);
+ Inst(Opcode::A32WriteMemory64, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type});
+ } else {
+ Inst(Opcode::A32WriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+ }
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto v = ByteReverseHalf(value);
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type});
+ } else {
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+ }
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto v = ByteReverseWord(value);
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type});
+ } else {
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+ }
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory64(const IR::U32& vaddr, const IR::U32& value_lo, const IR::U32& value_hi, IR::AccType acc_type) {
+ if (current_location.EFlag()) {
+ const auto vlo = ByteReverseWord(value_lo);
+ const auto vhi = ByteReverseWord(value_hi);
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, Pack2x32To1x64(vlo, vhi), IR::Value{acc_type});
+ } else {
+ return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, Pack2x32To1x64(value_lo, value_hi), IR::Value{acc_type});
+ }
+}
+
+void IREmitter::CoprocInternalOperation(size_t coproc_no, bool two, size_t opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, size_t opc2) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(opc1),
+ static_cast<u8>(CRd),
+ static_cast<u8>(CRn),
+ static_cast<u8>(CRm),
+ static_cast<u8>(opc2)};
+ Inst(Opcode::A32CoprocInternalOperation, IR::Value(coproc_info));
+}
+
+void IREmitter::CoprocSendOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2, const IR::U32& word) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(opc1),
+ static_cast<u8>(CRn),
+ static_cast<u8>(CRm),
+ static_cast<u8>(opc2)};
+ Inst(Opcode::A32CoprocSendOneWord, IR::Value(coproc_info), word);
+}
+
+void IREmitter::CoprocSendTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm, const IR::U32& word1, const IR::U32& word2) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(opc),
+ static_cast<u8>(CRm)};
+ Inst(Opcode::A32CoprocSendTwoWords, IR::Value(coproc_info), word1, word2);
+}
+
+IR::U32 IREmitter::CoprocGetOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(opc1),
+ static_cast<u8>(CRn),
+ static_cast<u8>(CRm),
+ static_cast<u8>(opc2)};
+ return Inst<IR::U32>(Opcode::A32CoprocGetOneWord, IR::Value(coproc_info));
+}
+
+IR::U64 IREmitter::CoprocGetTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(opc),
+ static_cast<u8>(CRm)};
+ return Inst<IR::U64>(Opcode::A32CoprocGetTwoWords, IR::Value(coproc_info));
+}
+
+void IREmitter::CoprocLoadWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(long_transfer ? 1 : 0),
+ static_cast<u8>(CRd),
+ static_cast<u8>(has_option ? 1 : 0),
+ static_cast<u8>(option)};
+ Inst(Opcode::A32CoprocLoadWords, IR::Value(coproc_info), address);
+}
+
+void IREmitter::CoprocStoreWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option) {
+ ASSERT(coproc_no <= 15);
+ const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no),
+ static_cast<u8>(two ? 1 : 0),
+ static_cast<u8>(long_transfer ? 1 : 0),
+ static_cast<u8>(CRd),
+ static_cast<u8>(has_option ? 1 : 0),
+ static_cast<u8>(option)};
+ Inst(Opcode::A32CoprocStoreWords, IR::Value(coproc_info), address);
+}
+
+IR::U64 IREmitter::ImmCurrentLocationDescriptor() {
+ return Imm64(IR::LocationDescriptor{current_location}.Value());
+}
+
+} // namespace Dynarmic::A32
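
A standalone sketch (not part of the diff) of the pipeline-offset rule implemented by IREmitter::PC() above: reads of the program counter observe the instruction's address plus 8 in Arm state and plus 4 in Thumb state, which is also why GetRegister(Reg::PC) folds to an immediate.

#include <cstdint>

// The same rule as IREmitter::PC(): current_location.PC() + (TFlag ? 4 : 8).
constexpr std::uint32_t ObservedPC(std::uint32_t instruction_address, bool t_flag) {
    return instruction_address + (t_flag ? 4u : 8u);
}

static_assert(ObservedPC(0x00001000, /*t_flag=*/false) == 0x00001008);  // Arm state
static_assert(ObservedPC(0x00001000, /*t_flag=*/true) == 0x00001004);   // Thumb state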
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h
new file mode 100644
index 0000000000..9fde4f8775
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h
@@ -0,0 +1,116 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <utility>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/ir/ir_emitter.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::A32 {
+
+enum class ArchVersion;
+enum class CoprocReg;
+enum class Exception;
+enum class ExtReg;
+enum class Reg;
+
+/**
+ * Convenience class to construct a basic block of the intermediate representation.
+ * `block` is the resulting block.
+ * The user of this class updates `current_location` as appropriate.
+ */
+class IREmitter : public IR::IREmitter {
+public:
+ IREmitter(IR::Block& block, LocationDescriptor descriptor, ArchVersion arch_version)
+ : IR::IREmitter(block), current_location(descriptor), arch_version(arch_version) {}
+
+ LocationDescriptor current_location;
+
+ size_t ArchVersion() const;
+
+ u32 PC() const;
+ u32 AlignPC(size_t alignment) const;
+
+ IR::U32 GetRegister(Reg source_reg);
+ IR::U32U64 GetExtendedRegister(ExtReg source_reg);
+ IR::U128 GetVector(ExtReg source_reg);
+ void SetRegister(Reg dest_reg, const IR::U32& value);
+ void SetExtendedRegister(ExtReg dest_reg, const IR::U32U64& value);
+ void SetVector(ExtReg dest_reg, const IR::U128& value);
+
+ void ALUWritePC(const IR::U32& value);
+ void BranchWritePC(const IR::U32& value);
+ void BXWritePC(const IR::U32& value);
+ void LoadWritePC(const IR::U32& value);
+ void UpdateUpperLocationDescriptor();
+
+ void CallSupervisor(const IR::U32& value);
+ void ExceptionRaised(Exception exception);
+
+ IR::U32 GetCpsr();
+ void SetCpsr(const IR::U32& value);
+ void SetCpsrNZCV(const IR::NZCV& value);
+ void SetCpsrNZCVRaw(const IR::U32& value);
+ void SetCpsrNZCVQ(const IR::U32& value);
+ void SetCheckBit(const IR::U1& value);
+ IR::U1 GetOverflowFrom(const IR::Value& value);
+ IR::U1 GetCFlag();
+ void OrQFlag(const IR::U1& value);
+ IR::U32 GetGEFlags();
+ void SetGEFlags(const IR::U32& value);
+ void SetGEFlagsCompressed(const IR::U32& value);
+
+ IR::NZCV NZFrom(const IR::Value& value);
+ void SetCpsrNZ(const IR::NZCV& nz);
+ void SetCpsrNZC(const IR::NZCV& nz, const IR::U1& c);
+
+ void DataSynchronizationBarrier();
+ void DataMemoryBarrier();
+ void InstructionSynchronizationBarrier();
+
+ IR::U32 GetFpscr();
+ void SetFpscr(const IR::U32& new_fpscr);
+ IR::U32 GetFpscrNZCV();
+ void SetFpscrNZCV(const IR::NZCV& new_fpscr_nzcv);
+
+ void ClearExclusive();
+ IR::UAny ReadMemory(size_t bitsize, const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U8 ReadMemory8(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U16 ReadMemory16(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U32 ReadMemory32(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U64 ReadMemory64(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U8 ExclusiveReadMemory8(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U16 ExclusiveReadMemory16(const IR::U32& vaddr, IR::AccType acc_type);
+ IR::U32 ExclusiveReadMemory32(const IR::U32& vaddr, IR::AccType acc_type);
+ std::pair<IR::U32, IR::U32> ExclusiveReadMemory64(const IR::U32& vaddr, IR::AccType acc_type);
+ void WriteMemory(size_t bitsize, const IR::U32& vaddr, const IR::UAny& value, IR::AccType acc_type);
+ void WriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type);
+ void WriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type);
+ void WriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type);
+ void WriteMemory64(const IR::U32& vaddr, const IR::U64& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory64(const IR::U32& vaddr, const IR::U32& value_lo, const IR::U32& value_hi, IR::AccType acc_type);
+
+ void CoprocInternalOperation(size_t coproc_no, bool two, size_t opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, size_t opc2);
+ void CoprocSendOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2, const IR::U32& word);
+ void CoprocSendTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm, const IR::U32& word1, const IR::U32& word2);
+ IR::U32 CoprocGetOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2);
+ IR::U64 CoprocGetTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm);
+ void CoprocLoadWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option);
+ void CoprocStoreWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option);
+
+private:
+ enum ArchVersion arch_version;
+ IR::U64 ImmCurrentLocationDescriptor();
+};
+
+} // namespace Dynarmic::A32
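
A hypothetical usage sketch (not part of the diff) of the class declared above. It assumes IR::Block is constructible from a location descriptor, as elsewhere in dynarmic; a real translator would choose arch_version and the emitted instructions from its own context.

#include "dynarmic/frontend/A32/a32_ir_emitter.h"
#include "dynarmic/interface/A32/arch_version.h"
#include "dynarmic/ir/basic_block.h"

void EmitMovExample(Dynarmic::A32::LocationDescriptor descriptor) {
    using namespace Dynarmic::A32;

    Dynarmic::IR::Block block{descriptor};              // assumption: Block(LocationDescriptor)
    IREmitter ir{block, descriptor, ArchVersion::v8};

    // r0 := r1, then step past one 4-byte Arm instruction.
    ir.SetRegister(Reg::R0, ir.GetRegister(Reg::R1));
    ir.current_location = ir.current_location.AdvancePC(4);
}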
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp
new file mode 100644
index 0000000000..ac3a374e0d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp
@@ -0,0 +1,21 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+
+#include <fmt/format.h>
+
+namespace Dynarmic::A32 {
+
+std::string ToString(const LocationDescriptor& descriptor) {
+ return fmt::format("{{{:08x},{},{},{:08x}{}}}",
+ descriptor.PC(),
+ descriptor.TFlag() ? "T" : "!T",
+ descriptor.EFlag() ? "E" : "!E",
+ descriptor.FPSCR().Value(),
+ descriptor.SingleStepping() ? ",step" : "");
+}
+
+} // namespace Dynarmic::A32
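
For reference, a small sketch (not part of the diff) of the string the formatter above produces; it assumes FPSCR is default-constructible like PSR, consistent with its definition earlier in this diff.

#include <cstdio>
#include "dynarmic/frontend/A32/a32_location_descriptor.h"

int main() {
    using namespace Dynarmic::A32;

    const LocationDescriptor desc{0x00001000, PSR{0x20 /* T bit set */}, FPSCR{}};
    std::printf("%s\n", ToString(desc).c_str());    // prints {00001000,T,!E,00000000}
}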
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h
new file mode 100644
index 0000000000..c53e75d4b7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h
@@ -0,0 +1,162 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <tuple>
+
+#include <fmt/format.h>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/FPSCR.h"
+#include "dynarmic/frontend/A32/ITState.h"
+#include "dynarmic/frontend/A32/PSR.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::A32 {
+
+/**
+ * LocationDescriptor describes the location of a basic block.
+ * The location is not solely based on the PC because other flags influence the way
+ * instructions should be translated. The CPSR.T flag is most notable since it
+ * tells us if the processor is in Thumb or Arm mode.
+ */
+class LocationDescriptor {
+public:
+ // Indicates bits that should be preserved within descriptors.
+ static constexpr u32 CPSR_MODE_MASK = 0x0600FE20;
+ static constexpr u32 FPSCR_MODE_MASK = 0x07F70000;
+
+ LocationDescriptor(u32 arm_pc, PSR cpsr, FPSCR fpscr, bool single_stepping = false)
+ : arm_pc(arm_pc)
+ , cpsr(cpsr.Value() & CPSR_MODE_MASK)
+ , fpscr(fpscr.Value() & FPSCR_MODE_MASK)
+ , single_stepping(single_stepping) {}
+
+ explicit LocationDescriptor(const IR::LocationDescriptor& o) {
+ arm_pc = static_cast<u32>(o.Value());
+ cpsr.T((o.Value() >> 32) & 1);
+ cpsr.E((o.Value() >> 32) & 2);
+ fpscr = (o.Value() >> 32) & FPSCR_MODE_MASK;
+ cpsr.IT(ITState{static_cast<u8>(o.Value() >> 40)});
+ single_stepping = (o.Value() >> 32) & 4;
+ }
+
+ u32 PC() const { return arm_pc; }
+ bool TFlag() const { return cpsr.T(); }
+ bool EFlag() const { return cpsr.E(); }
+ ITState IT() const { return cpsr.IT(); }
+
+ A32::PSR CPSR() const { return cpsr; }
+ A32::FPSCR FPSCR() const { return fpscr; }
+
+ bool SingleStepping() const { return single_stepping; }
+
+ bool operator==(const LocationDescriptor& o) const {
+ return std::tie(arm_pc, cpsr, fpscr, single_stepping) == std::tie(o.arm_pc, o.cpsr, o.fpscr, o.single_stepping);
+ }
+
+ bool operator!=(const LocationDescriptor& o) const {
+ return !operator==(o);
+ }
+
+ LocationDescriptor SetPC(u32 new_arm_pc) const {
+ return LocationDescriptor(new_arm_pc, cpsr, fpscr, single_stepping);
+ }
+
+ LocationDescriptor AdvancePC(int amount) const {
+ return LocationDescriptor(static_cast<u32>(arm_pc + amount), cpsr, fpscr, single_stepping);
+ }
+
+ LocationDescriptor SetTFlag(bool new_tflag) const {
+ PSR new_cpsr = cpsr;
+ new_cpsr.T(new_tflag);
+
+ return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping);
+ }
+
+ LocationDescriptor SetEFlag(bool new_eflag) const {
+ PSR new_cpsr = cpsr;
+ new_cpsr.E(new_eflag);
+
+ return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping);
+ }
+
+ LocationDescriptor SetFPSCR(u32 new_fpscr) const {
+ return LocationDescriptor(arm_pc, cpsr, A32::FPSCR{new_fpscr & FPSCR_MODE_MASK}, single_stepping);
+ }
+
+ LocationDescriptor SetIT(ITState new_it) const {
+ PSR new_cpsr = cpsr;
+ new_cpsr.IT(new_it);
+
+ return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping);
+ }
+
+ LocationDescriptor AdvanceIT() const {
+ return SetIT(IT().Advance());
+ }
+
+ LocationDescriptor SetSingleStepping(bool new_single_stepping) const {
+ return LocationDescriptor(arm_pc, cpsr, fpscr, new_single_stepping);
+ }
+
+ u64 UniqueHash() const noexcept {
+ // This value MUST BE UNIQUE.
+ // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint
+ const u64 pc_u64 = arm_pc;
+ const u64 fpscr_u64 = fpscr.Value();
+ const u64 t_u64 = cpsr.T() ? 1 : 0;
+ const u64 e_u64 = cpsr.E() ? 2 : 0;
+ const u64 single_stepping_u64 = single_stepping ? 4 : 0;
+ const u64 it_u64 = u64(cpsr.IT().Value()) << 8;
+ const u64 upper = (fpscr_u64 | t_u64 | e_u64 | single_stepping_u64 | it_u64) << 32;
+ return pc_u64 | upper;
+ }
+
+ operator IR::LocationDescriptor() const {
+ return IR::LocationDescriptor{UniqueHash()};
+ }
+
+private:
+ u32 arm_pc; ///< Current program counter value.
+ PSR cpsr; ///< Current program status register.
+ A32::FPSCR fpscr; ///< Floating point status control register.
+ bool single_stepping;
+};
+
+/**
+ * Provides a string representation of a LocationDescriptor.
+ *
+ * @param descriptor The descriptor to get a string representation of
+ */
+std::string ToString(const LocationDescriptor& descriptor);
+
+} // namespace Dynarmic::A32
+
+namespace std {
+template<>
+struct less<Dynarmic::A32::LocationDescriptor> {
+ bool operator()(const Dynarmic::A32::LocationDescriptor& x, const Dynarmic::A32::LocationDescriptor& y) const noexcept {
+ return x.UniqueHash() < y.UniqueHash();
+ }
+};
+template<>
+struct hash<Dynarmic::A32::LocationDescriptor> {
+ size_t operator()(const Dynarmic::A32::LocationDescriptor& x) const noexcept {
+ return std::hash<u64>()(x.UniqueHash());
+ }
+};
+} // namespace std
+
+template<>
+struct fmt::formatter<Dynarmic::A32::LocationDescriptor> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A32::LocationDescriptor descriptor, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::A32::ToString(descriptor), ctx);
+ }
+};
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp
new file mode 100644
index 0000000000..a47ce0b78b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp
@@ -0,0 +1,60 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/a32_types.h"
+
+#include <array>
+#include <ostream>
+
+#include <mcl/bit/bit_field.hpp>
+
+namespace Dynarmic::A32 {
+
+const char* CondToString(Cond cond, bool explicit_al) {
+ static constexpr std::array cond_strs = {"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"};
+ return (!explicit_al && cond == Cond::AL) ? "" : cond_strs.at(static_cast<size_t>(cond));
+}
+
+const char* RegToString(Reg reg) {
+ static constexpr std::array reg_strs = {"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc"};
+ return reg_strs.at(static_cast<size_t>(reg));
+}
+
+const char* ExtRegToString(ExtReg reg) {
+ static constexpr std::array reg_strs = {
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16",
+ "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
+
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
+ "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+
+ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "q16"};
+ return reg_strs.at(static_cast<size_t>(reg));
+}
+
+const char* CoprocRegToString(CoprocReg reg) {
+ static constexpr std::array reg_strs = {
+ "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8",
+ "c9", "c10", "c11", "c12", "c13", "c14", "c15"};
+ return reg_strs.at(static_cast<size_t>(reg));
+}
+
+std::string RegListToString(RegList reg_list) {
+ std::string ret;
+ bool first_reg = true;
+ for (size_t i = 0; i < 16; i++) {
+ if (mcl::bit::get_bit(i, reg_list)) {
+ if (!first_reg) {
+ ret += ", ";
+ }
+ ret += RegToString(static_cast<Reg>(i));
+ first_reg = false;
+ }
+ }
+ return ret;
+}
+
+} // namespace Dynarmic::A32
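
A tiny sketch (not part of the diff) of RegListToString() above: each set bit i in the 16-bit list names register i, with the aliases sp/lr/pc used for r13-r15.

#include <cstdio>
#include "dynarmic/frontend/A32/a32_types.h"

int main() {
    // Bits 0, 1 and 14 set: registers r0, r1 and lr.
    const Dynarmic::A32::RegList list = 0x4003;
    std::printf("%s\n", Dynarmic::A32::RegListToString(list).c_str());   // prints "r0, r1, lr"
}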
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h
new file mode 100644
index 0000000000..6f56bea51a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h
@@ -0,0 +1,177 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+#include <utility>
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/interface/A32/coprocessor_util.h"
+#include "dynarmic/ir/cond.h"
+
+namespace Dynarmic::A32 {
+
+using Cond = IR::Cond;
+
+enum class Reg {
+ R0,
+ R1,
+ R2,
+ R3,
+ R4,
+ R5,
+ R6,
+ R7,
+ R8,
+ R9,
+ R10,
+ R11,
+ R12,
+ R13,
+ R14,
+ R15,
+ SP = R13,
+ LR = R14,
+ PC = R15,
+ INVALID_REG = 99
+};
+
+enum class ExtReg {
+ // clang-format off
+ S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31,
+ D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31,
+ Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15
+ // clang-format on
+};
+
+using RegList = u16;
+
+enum class ShiftType {
+ LSL,
+ LSR,
+ ASR,
+ ROR ///< RRX falls under this too
+};
+
+enum class SignExtendRotation {
+ ROR_0, ///< ROR #0 or omitted
+ ROR_8, ///< ROR #8
+ ROR_16, ///< ROR #16
+ ROR_24 ///< ROR #24
+};
+
+const char* CondToString(Cond cond, bool explicit_al = false);
+const char* RegToString(Reg reg);
+const char* ExtRegToString(ExtReg reg);
+const char* CoprocRegToString(CoprocReg reg);
+std::string RegListToString(RegList reg_list);
+
+constexpr bool IsSingleExtReg(ExtReg reg) {
+ return reg >= ExtReg::S0 && reg <= ExtReg::S31;
+}
+
+constexpr bool IsDoubleExtReg(ExtReg reg) {
+ return reg >= ExtReg::D0 && reg <= ExtReg::D31;
+}
+
+constexpr bool IsQuadExtReg(ExtReg reg) {
+ return reg >= ExtReg::Q0 && reg <= ExtReg::Q15;
+}
+
+inline size_t RegNumber(Reg reg) {
+ ASSERT(reg != Reg::INVALID_REG);
+ return static_cast<size_t>(reg);
+}
+
+inline size_t RegNumber(ExtReg reg) {
+ if (IsSingleExtReg(reg)) {
+ return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::S0);
+ }
+
+ if (IsDoubleExtReg(reg)) {
+ return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::D0);
+ }
+
+ if (IsQuadExtReg(reg)) {
+ return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::Q0);
+ }
+
+ ASSERT_FALSE("Invalid extended register");
+}
+
+inline Reg operator+(Reg reg, size_t number) {
+ const size_t new_reg = RegNumber(reg) + number;
+ ASSERT(new_reg <= 15);
+
+ return static_cast<Reg>(new_reg);
+}
+
+inline ExtReg operator+(ExtReg reg, size_t number) {
+ const auto new_reg = static_cast<ExtReg>(static_cast<size_t>(reg) + number);
+
+ ASSERT((IsSingleExtReg(reg) && IsSingleExtReg(new_reg))
+ || (IsDoubleExtReg(reg) && IsDoubleExtReg(new_reg))
+ || (IsQuadExtReg(reg) && IsQuadExtReg(new_reg)));
+
+ return new_reg;
+}
+
+inline ExtReg ToExtRegQ(size_t base, bool bit) {
+ return ExtReg::Q0 + ((base >> 1) + (bit ? 8 : 0));
+}
+
+inline ExtReg ToExtRegD(size_t base, bool bit) {
+ return ExtReg::D0 + (base + (bit ? 16 : 0));
+}
+
+inline ExtReg ToExtRegS(size_t base, bool bit) {
+ return ExtReg::S0 + ((base << 1) + (bit ? 1 : 0));
+}
+
+inline ExtReg ToExtReg(bool sz, size_t base, bool bit) {
+ return sz ? ToExtRegD(base, bit) : ToExtRegS(base, bit);
+}
+
+inline ExtReg ToVector(bool Q, size_t base, bool bit) {
+ return Q ? ToExtRegQ(base, bit) : ToExtRegD(base, bit);
+}
+
+} // namespace Dynarmic::A32
+
+template<>
+struct fmt::formatter<Dynarmic::A32::Reg> : fmt::formatter<const char*> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A32::Reg reg, FormatContext& ctx) const {
+ return formatter<const char*>::format(Dynarmic::A32::RegToString(reg), ctx);
+ }
+};
+
+template<>
+struct fmt::formatter<Dynarmic::A32::ExtReg> : fmt::formatter<const char*> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A32::ExtReg reg, FormatContext& ctx) const {
+ return formatter<const char*>::format(Dynarmic::A32::ExtRegToString(reg), ctx);
+ }
+};
+
+template<>
+struct fmt::formatter<Dynarmic::A32::CoprocReg> : fmt::formatter<const char*> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A32::CoprocReg reg, FormatContext& ctx) const {
+ return formatter<const char*>::format(Dynarmic::A32::CoprocRegToString(reg), ctx);
+ }
+};
+
+template<>
+struct fmt::formatter<Dynarmic::A32::RegList> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A32::RegList reg_list, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::A32::RegListToString(reg_list), ctx);
+ }
+};
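
A short sketch (not part of the diff) of the VFP register-number helpers above: for single-precision registers the extra encoding bit is the least significant bit of the register number, while for double-precision and quad registers it selects the upper bank of sixteen.

#include <cassert>
#include "dynarmic/frontend/A32/a32_types.h"

int main() {
    using namespace Dynarmic::A32;

    assert(ToExtReg(/*sz=*/false, /*base=*/3, /*bit=*/true) == ExtReg::S7);    // (3 << 1) + 1
    assert(ToExtReg(/*sz=*/true, /*base=*/3, /*bit=*/true) == ExtReg::D19);    // 3 + 16
    assert(ToVector(/*Q=*/true, /*base=*/6, /*bit=*/false) == ExtReg::Q3);     // 6 >> 1
}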
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h
new file mode 100644
index 0000000000..16ae52e13a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h
@@ -0,0 +1,74 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ *
+ * Original version of table by Lioncash.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A32 {
+
+template<typename Visitor>
+using ArmMatcher = Decoder::Matcher<Visitor, u32>;
+
+template<typename Visitor>
+using ArmDecodeTable = std::array<std::vector<ArmMatcher<Visitor>>, 0x1000>;
+
+namespace detail {
+inline size_t ToFastLookupIndexArm(u32 instruction) {
+ return ((instruction >> 4) & 0x00F) | ((instruction >> 16) & 0xFF0);
+}
+} // namespace detail
+
+template<typename V>
+ArmDecodeTable<V> GetArmDecodeTable() {
+ std::vector<ArmMatcher<V>> list = {
+
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(ArmMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)),
+#include "./arm.inc"
+#undef INST
+
+ };
+
+ // If a matcher has more bits in its mask it is more specific, so it should come first.
+ std::stable_sort(list.begin(), list.end(), [](const auto& matcher1, const auto& matcher2) {
+ return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask());
+ });
+
+ ArmDecodeTable<V> table{};
+ for (size_t i = 0; i < table.size(); ++i) {
+ for (auto matcher : list) {
+ const auto expect = detail::ToFastLookupIndexArm(matcher.GetExpected());
+ const auto mask = detail::ToFastLookupIndexArm(matcher.GetMask());
+ if ((i & mask) == expect) {
+ table[i].push_back(matcher);
+ }
+ }
+ }
+ return table;
+}
+
+template<typename V>
+std::optional<std::reference_wrapper<const ArmMatcher<V>>> DecodeArm(u32 instruction) {
+ static const auto table = GetArmDecodeTable<V>();
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ const auto& subtable = table[detail::ToFastLookupIndexArm(instruction)];
+ auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction);
+ return iter != subtable.end() ? std::optional<std::reference_wrapper<const ArmMatcher<V>>>(*iter) : std::nullopt;
+}
+
+} // namespace Dynarmic::A32
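
A standalone sketch (not part of the diff) of the bucketing used by the decode table above: detail::ToFastLookupIndexArm() hashes an instruction into one of 0x1000 buckets from bits [7:4] and [27:20], so DecodeArm() only has to scan the matchers whose mask/expected pair can land in that bucket.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Mirrors detail::ToFastLookupIndexArm(): bits [7:4] form the low nibble of the
// index and bits [27:20] form the upper byte.
std::size_t FastLookupIndex(std::uint32_t instruction) {
    return ((instruction >> 4) & 0x00F) | ((instruction >> 16) & 0xFF0);
}

int main() {
    // BX r0 is 0xE12FFF10 ("cccc 0001 0010 1111 1111 1111 0001 mmmm" with cond=1110, m=0).
    assert(FastLookupIndex(0xE12FFF10) == 0x121);
}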
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc
new file mode 100644
index 0000000000..90995c4c05
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc
@@ -0,0 +1,316 @@
+// Barrier instructions
+INST(arm_DMB, "DMB", "1111010101111111111100000101oooo") // v7
+INST(arm_DSB, "DSB", "1111010101111111111100000100oooo") // v7
+INST(arm_ISB, "ISB", "1111010101111111111100000110oooo") // v7
+
+// Branch instructions
+INST(arm_BLX_imm, "BLX (imm)", "1111101hvvvvvvvvvvvvvvvvvvvvvvvv") // v5
+INST(arm_BLX_reg, "BLX (reg)", "cccc000100101111111111110011mmmm") // v5
+INST(arm_B, "B", "cccc1010vvvvvvvvvvvvvvvvvvvvvvvv") // v1
+INST(arm_BL, "BL", "cccc1011vvvvvvvvvvvvvvvvvvvvvvvv") // v1
+INST(arm_BX, "BX", "cccc000100101111111111110001mmmm") // v4T
+INST(arm_BXJ, "BXJ", "cccc000100101111111111110010mmmm") // v5J
+
+// CRC32 instructions
+INST(arm_CRC32, "CRC32", "cccc00010zz0nnnndddd00000100mmmm") // v8
+INST(arm_CRC32C, "CRC32C", "cccc00010zz0nnnndddd00100100mmmm") // v8
+
+// Coprocessor instructions
+INST(arm_CDP, "CDP", "cccc1110ooooNNNNDDDDppppooo0MMMM") // v2 (CDP2: v5)
+INST(arm_LDC, "LDC", "cccc110pudw1nnnnDDDDppppvvvvvvvv") // v2 (LDC2: v5)
+INST(arm_MCR, "MCR", "cccc1110ooo0NNNNttttppppooo1MMMM") // v2 (MCR2: v5)
+INST(arm_MCRR, "MCRR", "cccc11000100uuuuttttppppooooMMMM") // v5E (MCRR2: v6)
+INST(arm_MRC, "MRC", "cccc1110ooo1NNNNttttppppooo1MMMM") // v2 (MRC2: v5)
+INST(arm_MRRC, "MRRC", "cccc11000101uuuuttttppppooooMMMM") // v5E (MRRC2: v6)
+INST(arm_STC, "STC", "cccc110pudw0nnnnDDDDppppvvvvvvvv") // v2 (STC2: v5)
+
+// Data Processing instructions
+INST(arm_ADC_imm, "ADC (imm)", "cccc0010101Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_ADC_reg, "ADC (reg)", "cccc0000101Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_ADC_rsr, "ADC (rsr)", "cccc0000101Snnnnddddssss0rr1mmmm") // v1
+INST(arm_ADD_imm, "ADD (imm)", "cccc0010100Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_ADD_reg, "ADD (reg)", "cccc0000100Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_ADD_rsr, "ADD (rsr)", "cccc0000100Snnnnddddssss0rr1mmmm") // v1
+INST(arm_AND_imm, "AND (imm)", "cccc0010000Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_AND_reg, "AND (reg)", "cccc0000000Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_AND_rsr, "AND (rsr)", "cccc0000000Snnnnddddssss0rr1mmmm") // v1
+INST(arm_BIC_imm, "BIC (imm)", "cccc0011110Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_BIC_reg, "BIC (reg)", "cccc0001110Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_BIC_rsr, "BIC (rsr)", "cccc0001110Snnnnddddssss0rr1mmmm") // v1
+INST(arm_CMN_imm, "CMN (imm)", "cccc00110111nnnn0000rrrrvvvvvvvv") // v1
+INST(arm_CMN_reg, "CMN (reg)", "cccc00010111nnnn0000vvvvvrr0mmmm") // v1
+INST(arm_CMN_rsr, "CMN (rsr)", "cccc00010111nnnn0000ssss0rr1mmmm") // v1
+INST(arm_CMP_imm, "CMP (imm)", "cccc00110101nnnn0000rrrrvvvvvvvv") // v1
+INST(arm_CMP_reg, "CMP (reg)", "cccc00010101nnnn0000vvvvvrr0mmmm") // v1
+INST(arm_CMP_rsr, "CMP (rsr)", "cccc00010101nnnn0000ssss0rr1mmmm") // v1
+INST(arm_EOR_imm, "EOR (imm)", "cccc0010001Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_EOR_reg, "EOR (reg)", "cccc0000001Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_EOR_rsr, "EOR (rsr)", "cccc0000001Snnnnddddssss0rr1mmmm") // v1
+INST(arm_MOV_imm, "MOV (imm)", "cccc0011101S0000ddddrrrrvvvvvvvv") // v1
+INST(arm_MOV_reg, "MOV (reg)", "cccc0001101S0000ddddvvvvvrr0mmmm") // v1
+INST(arm_MOV_rsr, "MOV (rsr)", "cccc0001101S0000ddddssss0rr1mmmm") // v1
+INST(arm_MVN_imm, "MVN (imm)", "cccc0011111S0000ddddrrrrvvvvvvvv") // v1
+INST(arm_MVN_reg, "MVN (reg)", "cccc0001111S0000ddddvvvvvrr0mmmm") // v1
+INST(arm_MVN_rsr, "MVN (rsr)", "cccc0001111S0000ddddssss0rr1mmmm") // v1
+INST(arm_ORR_imm, "ORR (imm)", "cccc0011100Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_ORR_reg, "ORR (reg)", "cccc0001100Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_ORR_rsr, "ORR (rsr)", "cccc0001100Snnnnddddssss0rr1mmmm") // v1
+INST(arm_RSB_imm, "RSB (imm)", "cccc0010011Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_RSB_reg, "RSB (reg)", "cccc0000011Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_RSB_rsr, "RSB (rsr)", "cccc0000011Snnnnddddssss0rr1mmmm") // v1
+INST(arm_RSC_imm, "RSC (imm)", "cccc0010111Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_RSC_reg, "RSC (reg)", "cccc0000111Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_RSC_rsr, "RSC (rsr)", "cccc0000111Snnnnddddssss0rr1mmmm") // v1
+INST(arm_SBC_imm, "SBC (imm)", "cccc0010110Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_SBC_reg, "SBC (reg)", "cccc0000110Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_SBC_rsr, "SBC (rsr)", "cccc0000110Snnnnddddssss0rr1mmmm") // v1
+INST(arm_SUB_imm, "SUB (imm)", "cccc0010010Snnnnddddrrrrvvvvvvvv") // v1
+INST(arm_SUB_reg, "SUB (reg)", "cccc0000010Snnnnddddvvvvvrr0mmmm") // v1
+INST(arm_SUB_rsr, "SUB (rsr)", "cccc0000010Snnnnddddssss0rr1mmmm") // v1
+INST(arm_TEQ_imm, "TEQ (imm)", "cccc00110011nnnn0000rrrrvvvvvvvv") // v1
+INST(arm_TEQ_reg, "TEQ (reg)", "cccc00010011nnnn0000vvvvvrr0mmmm") // v1
+INST(arm_TEQ_rsr, "TEQ (rsr)", "cccc00010011nnnn0000ssss0rr1mmmm") // v1
+INST(arm_TST_imm, "TST (imm)", "cccc00110001nnnn0000rrrrvvvvvvvv") // v1
+INST(arm_TST_reg, "TST (reg)", "cccc00010001nnnn0000vvvvvrr0mmmm") // v1
+INST(arm_TST_rsr, "TST (rsr)", "cccc00010001nnnn0000ssss0rr1mmmm") // v1
+
+// Exception Generating instructions
+INST(arm_BKPT, "BKPT", "cccc00010010vvvvvvvvvvvv0111vvvv") // v5
+INST(arm_SVC, "SVC", "cccc1111vvvvvvvvvvvvvvvvvvvvvvvv") // v1
+INST(arm_UDF, "UDF", "111001111111------------1111----")
+
+// Extension instructions
+INST(arm_SXTB, "SXTB", "cccc011010101111ddddrr000111mmmm") // v6
+INST(arm_SXTB16, "SXTB16", "cccc011010001111ddddrr000111mmmm") // v6
+INST(arm_SXTH, "SXTH", "cccc011010111111ddddrr000111mmmm") // v6
+INST(arm_SXTAB, "SXTAB", "cccc01101010nnnnddddrr000111mmmm") // v6
+INST(arm_SXTAB16, "SXTAB16", "cccc01101000nnnnddddrr000111mmmm") // v6
+INST(arm_SXTAH, "SXTAH", "cccc01101011nnnnddddrr000111mmmm") // v6
+INST(arm_UXTB, "UXTB", "cccc011011101111ddddrr000111mmmm") // v6
+INST(arm_UXTB16, "UXTB16", "cccc011011001111ddddrr000111mmmm") // v6
+INST(arm_UXTH, "UXTH", "cccc011011111111ddddrr000111mmmm") // v6
+INST(arm_UXTAB, "UXTAB", "cccc01101110nnnnddddrr000111mmmm") // v6
+INST(arm_UXTAB16, "UXTAB16", "cccc01101100nnnnddddrr000111mmmm") // v6
+INST(arm_UXTAH, "UXTAH", "cccc01101111nnnnddddrr000111mmmm") // v6
+
+// Hint instructions
+INST(arm_PLD_imm, "PLD (imm)", "11110101uz01nnnn1111iiiiiiiiiiii") // v5E for PLD; v7 for PLDW
+INST(arm_PLD_reg, "PLD (reg)", "11110111uz01nnnn1111iiiiitt0mmmm") // v5E for PLD; v7 for PLDW
+INST(arm_SEV, "SEV", "----0011001000001111000000000100") // v6K
+INST(arm_SEVL, "SEVL", "----0011001000001111000000000101") // v8
+INST(arm_WFE, "WFE", "----0011001000001111000000000010") // v6K
+INST(arm_WFI, "WFI", "----0011001000001111000000000011") // v6K
+INST(arm_YIELD, "YIELD", "----0011001000001111000000000001") // v6K
+INST(arm_NOP, "Reserved Hint", "----0011001000001111------------")
+INST(arm_NOP, "Reserved Hint", "----001100100000111100000000----")
+
+// Synchronization Primitive instructions
+INST(arm_CLREX, "CLREX", "11110101011111111111000000011111") // v6K
+INST(arm_SWP, "SWP", "cccc00010000nnnntttt00001001uuuu") // v2S (v6: Deprecated)
+INST(arm_SWPB, "SWPB", "cccc00010100nnnntttt00001001uuuu") // v2S (v6: Deprecated)
+INST(arm_STL, "STL", "cccc00011000nnnn111111001001tttt") // v8
+INST(arm_STLEX, "STLEX", "cccc00011000nnnndddd11101001tttt") // v8
+INST(arm_STREX, "STREX", "cccc00011000nnnndddd11111001mmmm") // v6
+INST(arm_LDA, "LDA", "cccc00011001nnnndddd110010011111") // v8
+INST(arm_LDAEX, "LDAEX", "cccc00011001nnnndddd111010011111") // v8
+INST(arm_LDREX, "LDREX", "cccc00011001nnnndddd111110011111") // v6
+INST(arm_STLEXD, "STLEXD", "cccc00011010nnnndddd11101001mmmm") // v8
+INST(arm_STREXD, "STREXD", "cccc00011010nnnndddd11111001mmmm") // v6K
+INST(arm_LDAEXD, "LDAEXD", "cccc00011011nnnndddd111010011111") // v8
+INST(arm_LDREXD, "LDREXD", "cccc00011011nnnndddd111110011111") // v6K
+INST(arm_STLB, "STLB", "cccc00011100nnnn111111001001tttt") // v8
+INST(arm_STLEXB, "STLEXB", "cccc00011100nnnndddd11101001mmmm") // v8
+INST(arm_STREXB, "STREXB", "cccc00011100nnnndddd11111001mmmm") // v6K
+INST(arm_LDAB, "LDAB", "cccc00011101nnnndddd110010011111") // v8
+INST(arm_LDAEXB, "LDAEXB", "cccc00011101nnnndddd111010011111") // v8
+INST(arm_LDREXB, "LDREXB", "cccc00011101nnnndddd111110011111") // v6K
+INST(arm_STLH, "STLH", "cccc00011110nnnn111111001001mmmm") // v8
+INST(arm_STLEXH, "STLEXH", "cccc00011110nnnndddd11101001mmmm") // v8
+INST(arm_STREXH, "STREXH", "cccc00011110nnnndddd11111001mmmm") // v6K
+INST(arm_LDAH, "LDAH", "cccc00011111nnnndddd110010011111") // v8
+INST(arm_LDAEXH, "LDAEXH", "cccc00011111nnnndddd111010011111") // v8
+INST(arm_LDREXH, "LDREXH", "cccc00011111nnnndddd111110011111") // v6K
+
+// Load/Store instructions
+INST(arm_LDRBT, "LDRBT (A1)", "----0100-111--------------------") // v1
+INST(arm_LDRBT, "LDRBT (A2)", "----0110-111---------------0----") // v1
+INST(arm_LDRHT, "LDRHT (A1)", "----0000-111------------1011----") // v6T2
+INST(arm_LDRHT, "LDRHT (A1)", "----0000-1111111--------1011----") // v6T2
+INST(arm_LDRHT, "LDRHT (A2)", "----0000-011--------00001011----") // v6T2
+INST(arm_LDRSBT, "LDRSBT (A1)", "----0000-111------------1101----") // v6T2
+INST(arm_LDRSBT, "LDRSBT (A2)", "----0000-011--------00001101----") // v6T2
+INST(arm_LDRSHT, "LDRSHT (A1)", "----0000-111------------1111----") // v6T2
+INST(arm_LDRSHT, "LDRSHT (A2)", "----0000-011--------00001111----") // v6T2
+INST(arm_LDRT, "LDRT (A1)", "----0100-011--------------------") // v1
+INST(arm_LDRT, "LDRT (A2)", "----0110-011---------------0----") // v1
+INST(arm_STRBT, "STRBT (A1)", "----0100-110--------------------") // v1
+INST(arm_STRBT, "STRBT (A2)", "----0110-110---------------0----") // v1
+INST(arm_STRHT, "STRHT (A1)", "----0000-110------------1011----") // v6T2
+INST(arm_STRHT, "STRHT (A2)", "----0000-010--------00001011----") // v6T2
+INST(arm_STRT, "STRT (A1)", "----0100-010--------------------") // v1
+INST(arm_STRT, "STRT (A2)", "----0110-010---------------0----") // v1
+INST(arm_LDR_lit, "LDR (lit)", "cccc0101u0011111ttttvvvvvvvvvvvv") // v1
+INST(arm_LDR_imm, "LDR (imm)", "cccc010pu0w1nnnnttttvvvvvvvvvvvv") // v1
+INST(arm_LDR_reg, "LDR (reg)", "cccc011pu0w1nnnnttttvvvvvrr0mmmm") // v1
+INST(arm_LDRB_lit, "LDRB (lit)", "cccc0101u1011111ttttvvvvvvvvvvvv") // v1
+INST(arm_LDRB_imm, "LDRB (imm)", "cccc010pu1w1nnnnttttvvvvvvvvvvvv") // v1
+INST(arm_LDRB_reg, "LDRB (reg)", "cccc011pu1w1nnnnttttvvvvvrr0mmmm") // v1
+INST(arm_LDRD_lit, "LDRD (lit)", "cccc0001u1001111ttttvvvv1101vvvv") // v5E
+INST(arm_LDRD_imm, "LDRD (imm)", "cccc000pu1w0nnnnttttvvvv1101vvvv") // v5E
+INST(arm_LDRD_reg, "LDRD (reg)", "cccc000pu0w0nnnntttt00001101mmmm") // v5E
+INST(arm_LDRH_lit, "LDRH (lit)", "cccc000pu1w11111ttttvvvv1011vvvv") // v4
+INST(arm_LDRH_imm, "LDRH (imm)", "cccc000pu1w1nnnnttttvvvv1011vvvv") // v4
+INST(arm_LDRH_reg, "LDRH (reg)", "cccc000pu0w1nnnntttt00001011mmmm") // v4
+INST(arm_LDRSB_lit, "LDRSB (lit)", "cccc0001u1011111ttttvvvv1101vvvv") // v4
+INST(arm_LDRSB_imm, "LDRSB (imm)", "cccc000pu1w1nnnnttttvvvv1101vvvv") // v4
+INST(arm_LDRSB_reg, "LDRSB (reg)", "cccc000pu0w1nnnntttt00001101mmmm") // v4
+INST(arm_LDRSH_lit, "LDRSH (lit)", "cccc0001u1011111ttttvvvv1111vvvv") // v4
+INST(arm_LDRSH_imm, "LDRSH (imm)", "cccc000pu1w1nnnnttttvvvv1111vvvv") // v4
+INST(arm_LDRSH_reg, "LDRSH (reg)", "cccc000pu0w1nnnntttt00001111mmmm") // v4
+INST(arm_STR_imm, "STR (imm)", "cccc010pu0w0nnnnttttvvvvvvvvvvvv") // v1
+INST(arm_STR_reg, "STR (reg)", "cccc011pu0w0nnnnttttvvvvvrr0mmmm") // v1
+INST(arm_STRB_imm, "STRB (imm)", "cccc010pu1w0nnnnttttvvvvvvvvvvvv") // v1
+INST(arm_STRB_reg, "STRB (reg)", "cccc011pu1w0nnnnttttvvvvvrr0mmmm") // v1
+INST(arm_STRD_imm, "STRD (imm)", "cccc000pu1w0nnnnttttvvvv1111vvvv") // v5E
+INST(arm_STRD_reg, "STRD (reg)", "cccc000pu0w0nnnntttt00001111mmmm") // v5E
+INST(arm_STRH_imm, "STRH (imm)", "cccc000pu1w0nnnnttttvvvv1011vvvv") // v4
+INST(arm_STRH_reg, "STRH (reg)", "cccc000pu0w0nnnntttt00001011mmmm") // v4
+
+// Load/Store Multiple instructions
+INST(arm_LDM, "LDM", "cccc100010w1nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_LDMDA, "LDMDA", "cccc100000w1nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_LDMDB, "LDMDB", "cccc100100w1nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_LDMIB, "LDMIB", "cccc100110w1nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_LDM_usr, "LDM (usr reg)", "----100--101--------------------") // v1
+INST(arm_LDM_eret, "LDM (exce ret)", "----100--1-1----1---------------") // v1
+INST(arm_STM, "STM", "cccc100010w0nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_STMDA, "STMDA", "cccc100000w0nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_STMDB, "STMDB", "cccc100100w0nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_STMIB, "STMIB", "cccc100110w0nnnnxxxxxxxxxxxxxxxx") // v1
+INST(arm_STM_usr, "STM (usr reg)", "----100--100--------------------") // v1
+
+// Miscellaneous instructions
+INST(arm_BFC, "BFC", "cccc0111110vvvvvddddvvvvv0011111") // v6T2
+INST(arm_BFI, "BFI", "cccc0111110vvvvvddddvvvvv001nnnn") // v6T2
+INST(arm_CLZ, "CLZ", "cccc000101101111dddd11110001mmmm") // v5
+INST(arm_MOVT, "MOVT", "cccc00110100vvvvddddvvvvvvvvvvvv") // v6T2
+INST(arm_MOVW, "MOVW", "cccc00110000vvvvddddvvvvvvvvvvvv") // v6T2
+INST(arm_NOP, "NOP", "----0011001000001111000000000000") // v6K
+INST(arm_SBFX, "SBFX", "cccc0111101wwwwwddddvvvvv101nnnn") // v6T2
+INST(arm_SEL, "SEL", "cccc01101000nnnndddd11111011mmmm") // v6
+INST(arm_UBFX, "UBFX", "cccc0111111wwwwwddddvvvvv101nnnn") // v6T2
+
+// Unsigned Sum of Absolute Differences instructions
+INST(arm_USAD8, "USAD8", "cccc01111000dddd1111mmmm0001nnnn") // v6
+INST(arm_USADA8, "USADA8", "cccc01111000ddddaaaammmm0001nnnn") // v6
+
+// Packing instructions
+INST(arm_PKHBT, "PKHBT", "cccc01101000nnnnddddvvvvv001mmmm") // v6K
+INST(arm_PKHTB, "PKHTB", "cccc01101000nnnnddddvvvvv101mmmm") // v6K
+
+// Reversal instructions
+INST(arm_RBIT, "RBIT", "cccc011011111111dddd11110011mmmm") // v6T2
+INST(arm_REV, "REV", "cccc011010111111dddd11110011mmmm") // v6
+INST(arm_REV16, "REV16", "cccc011010111111dddd11111011mmmm") // v6
+INST(arm_REVSH, "REVSH", "cccc011011111111dddd11111011mmmm") // v6
+
+// Saturation instructions
+INST(arm_SSAT, "SSAT", "cccc0110101vvvvvddddvvvvvr01nnnn") // v6
+INST(arm_SSAT16, "SSAT16", "cccc01101010vvvvdddd11110011nnnn") // v6
+INST(arm_USAT, "USAT", "cccc0110111vvvvvddddvvvvvr01nnnn") // v6
+INST(arm_USAT16, "USAT16", "cccc01101110vvvvdddd11110011nnnn") // v6
+
+// Divide instructions
+INST(arm_SDIV, "SDIV", "cccc01110001dddd1111mmmm0001nnnn") // v7a
+INST(arm_UDIV, "UDIV", "cccc01110011dddd1111mmmm0001nnnn") // v7a
+
+// Multiply (Normal) instructions
+INST(arm_MLA, "MLA", "cccc0000001Sddddaaaammmm1001nnnn") // v2
+INST(arm_MLS, "MLS", "cccc00000110ddddaaaammmm1001nnnn") // v6T2
+INST(arm_MUL, "MUL", "cccc0000000Sdddd0000mmmm1001nnnn") // v2
+
+// Multiply (Long) instructions
+INST(arm_SMLAL, "SMLAL", "cccc0000111Sddddaaaammmm1001nnnn") // v3M
+INST(arm_SMULL, "SMULL", "cccc0000110Sddddaaaammmm1001nnnn") // v3M
+INST(arm_UMAAL, "UMAAL", "cccc00000100ddddaaaammmm1001nnnn") // v6
+INST(arm_UMLAL, "UMLAL", "cccc0000101Sddddaaaammmm1001nnnn") // v3M
+INST(arm_UMULL, "UMULL", "cccc0000100Sddddaaaammmm1001nnnn") // v3M
+
+// Multiply (Halfword) instructions
+INST(arm_SMLALxy, "SMLALXY", "cccc00010100ddddaaaammmm1xy0nnnn") // v5xP
+INST(arm_SMLAxy, "SMLAXY", "cccc00010000ddddaaaammmm1xy0nnnn") // v5xP
+INST(arm_SMULxy, "SMULXY", "cccc00010110dddd0000mmmm1xy0nnnn") // v5xP
+
+// Multiply (Word by Halfword) instructions
+INST(arm_SMLAWy, "SMLAWY", "cccc00010010ddddaaaammmm1y00nnnn") // v5xP
+INST(arm_SMULWy, "SMULWY", "cccc00010010dddd0000mmmm1y10nnnn") // v5xP
+
+// Multiply (Most Significant Word) instructions
+INST(arm_SMMUL, "SMMUL", "cccc01110101dddd1111mmmm00R1nnnn") // v6
+INST(arm_SMMLA, "SMMLA", "cccc01110101ddddaaaammmm00R1nnnn") // v6
+INST(arm_SMMLS, "SMMLS", "cccc01110101ddddaaaammmm11R1nnnn") // v6
+
+// Multiply (Dual) instructions
+INST(arm_SMLAD, "SMLAD", "cccc01110000ddddaaaammmm00M1nnnn") // v6
+INST(arm_SMLALD, "SMLALD", "cccc01110100ddddaaaammmm00M1nnnn") // v6
+INST(arm_SMLSD, "SMLSD", "cccc01110000ddddaaaammmm01M1nnnn") // v6
+INST(arm_SMLSLD, "SMLSLD", "cccc01110100ddddaaaammmm01M1nnnn") // v6
+INST(arm_SMUAD, "SMUAD", "cccc01110000dddd1111mmmm00M1nnnn") // v6
+INST(arm_SMUSD, "SMUSD", "cccc01110000dddd1111mmmm01M1nnnn") // v6
+
+// Parallel Add/Subtract (Modulo) instructions
+INST(arm_SADD8, "SADD8", "cccc01100001nnnndddd11111001mmmm") // v6
+INST(arm_SADD16, "SADD16", "cccc01100001nnnndddd11110001mmmm") // v6
+INST(arm_SASX, "SASX", "cccc01100001nnnndddd11110011mmmm") // v6
+INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6
+INST(arm_SSUB8, "SSUB8", "cccc01100001nnnndddd11111111mmmm") // v6
+INST(arm_SSUB16, "SSUB16", "cccc01100001nnnndddd11110111mmmm") // v6
+INST(arm_UADD8, "UADD8", "cccc01100101nnnndddd11111001mmmm") // v6
+INST(arm_UADD16, "UADD16", "cccc01100101nnnndddd11110001mmmm") // v6
+INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6
+INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6
+INST(arm_USUB8, "USUB8", "cccc01100101nnnndddd11111111mmmm") // v6
+INST(arm_USUB16, "USUB16", "cccc01100101nnnndddd11110111mmmm") // v6
+
+// Parallel Add/Subtract (Saturating) instructions
+INST(arm_QADD8, "QADD8", "cccc01100010nnnndddd11111001mmmm") // v6
+INST(arm_QADD16, "QADD16", "cccc01100010nnnndddd11110001mmmm") // v6
+INST(arm_QASX, "QASX", "cccc01100010nnnndddd11110011mmmm") // v6
+INST(arm_QSAX, "QSAX", "cccc01100010nnnndddd11110101mmmm") // v6
+INST(arm_QSUB8, "QSUB8", "cccc01100010nnnndddd11111111mmmm") // v6
+INST(arm_QSUB16, "QSUB16", "cccc01100010nnnndddd11110111mmmm") // v6
+INST(arm_UQADD8, "UQADD8", "cccc01100110nnnndddd11111001mmmm") // v6
+INST(arm_UQADD16, "UQADD16", "cccc01100110nnnndddd11110001mmmm") // v6
+INST(arm_UQASX, "UQASX", "cccc01100110nnnndddd11110011mmmm") // v6
+INST(arm_UQSAX, "UQSAX", "cccc01100110nnnndddd11110101mmmm") // v6
+INST(arm_UQSUB8, "UQSUB8", "cccc01100110nnnndddd11111111mmmm") // v6
+INST(arm_UQSUB16, "UQSUB16", "cccc01100110nnnndddd11110111mmmm") // v6
+
+// Parallel Add/Subtract (Halving) instructions
+INST(arm_SHADD8, "SHADD8", "cccc01100011nnnndddd11111001mmmm") // v6
+INST(arm_SHADD16, "SHADD16", "cccc01100011nnnndddd11110001mmmm") // v6
+INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6
+INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6
+INST(arm_SHSUB8, "SHSUB8", "cccc01100011nnnndddd11111111mmmm") // v6
+INST(arm_SHSUB16, "SHSUB16", "cccc01100011nnnndddd11110111mmmm") // v6
+INST(arm_UHADD8, "UHADD8", "cccc01100111nnnndddd11111001mmmm") // v6
+INST(arm_UHADD16, "UHADD16", "cccc01100111nnnndddd11110001mmmm") // v6
+INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6
+INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6
+INST(arm_UHSUB8, "UHSUB8", "cccc01100111nnnndddd11111111mmmm") // v6
+INST(arm_UHSUB16, "UHSUB16", "cccc01100111nnnndddd11110111mmmm") // v6
+
+// Saturated Add/Subtract instructions
+INST(arm_QADD, "QADD", "cccc00010000nnnndddd00000101mmmm") // v5xP
+INST(arm_QSUB, "QSUB", "cccc00010010nnnndddd00000101mmmm") // v5xP
+INST(arm_QDADD, "QDADD", "cccc00010100nnnndddd00000101mmmm") // v5xP
+INST(arm_QDSUB, "QDSUB", "cccc00010110nnnndddd00000101mmmm") // v5xP
+
+// Status Register Access instructions
+INST(arm_CPS, "CPS", "111100010000---00000000---0-----") // v6
+INST(arm_SETEND, "SETEND", "1111000100000001000000e000000000") // v6
+INST(arm_MRS, "MRS", "cccc000100001111dddd000000000000") // v3
+INST(arm_MSR_imm, "MSR (imm)", "cccc00110010mmmm1111rrrrvvvvvvvv") // v3
+INST(arm_MSR_reg, "MSR (reg)", "cccc00010010mmmm111100000000nnnn") // v3
+INST(arm_RFE, "RFE", "1111100--0-1----0000101000000000") // v6
+INST(arm_SRS, "SRS", "1111100--1-0110100000101000-----") // v6
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h
new file mode 100644
index 0000000000..cfcd28c6e6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h
@@ -0,0 +1,78 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A32 {
+
+template<typename Visitor>
+using ASIMDMatcher = Decoder::Matcher<Visitor, u32>;
+
+template<typename V>
+std::vector<ASIMDMatcher<V>> GetASIMDDecodeTable() {
+ std::vector<ASIMDMatcher<V>> table = {
+
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(ASIMDMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)),
+#include "./asimd.inc"
+#undef INST
+
+ };
+
+ // Exceptions to the rule of thumb.
+ const std::set<std::string> comes_first{
+ "VBIC, VMOV, VMVN, VORR (immediate)",
+ "VEXT",
+ "VTBL",
+ "VTBX",
+ "VDUP (scalar)",
+ };
+ const std::set<std::string> comes_last{
+ "VMLA (scalar)",
+ "VMLAL (scalar)",
+ "VQDMLAL/VQDMLSL (scalar)",
+ "VMUL (scalar)",
+ "VMULL (scalar)",
+ "VQDMULL (scalar)",
+ "VQDMULH (scalar)",
+ "VQRDMULH (scalar)",
+ };
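+ // The two stable_partitions below move the comes_first entries to the front of the
+ // table and the comes_last entries to the back; only the region in between is then
+ // sorted by mask specificity.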
+ const auto sort_begin = std::stable_partition(table.begin(), table.end(), [&](const auto& matcher) {
+ return comes_first.count(matcher.GetName()) > 0;
+ });
+ const auto sort_end = std::stable_partition(table.begin(), table.end(), [&](const auto& matcher) {
+ return comes_last.count(matcher.GetName()) == 0;
+ });
+
+ // If a matcher has more bits in its mask it is more specific, so it should come first.
+ std::stable_sort(sort_begin, sort_end, [](const auto& matcher1, const auto& matcher2) {
+ return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask());
+ });
+
+ return table;
+}
+
+template<typename V>
+std::optional<std::reference_wrapper<const ASIMDMatcher<V>>> DecodeASIMD(u32 instruction) {
+ static const auto table = GetASIMDDecodeTable<V>();
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ auto iter = std::find_if(table.begin(), table.end(), matches_instruction);
+ return iter != table.end() ? std::optional<std::reference_wrapper<const ASIMDMatcher<V>>>(*iter) : std::nullopt;
+}
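+
+// Usage sketch (the visitor type and variable names below are illustrative only; any type
+// providing the asimd_* handlers works):
+//
+//     if (const auto matcher = DecodeASIMD<MyVisitor>(instruction)) {
+//         matcher->get().call(visitor, instruction);
+//     }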
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc
new file mode 100644
index 0000000000..d0dfd86752
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc
@@ -0,0 +1,172 @@
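+// Bitstring legend: '0' and '1' are fixed bits that form each matcher's mask and expected
+// value; '-' marks don't-care bits; any other character labels operand-field bits
+// (D, N, M, Q, z, ...) that the decoder extracts as handler arguments.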
+// Three registers of the same length
+INST(asimd_VHADD, "VHADD", "1111001U0Dzznnnndddd0000NQM0mmmm") // ASIMD
+INST(asimd_VQADD, "VQADD", "1111001U0Dzznnnndddd0000NQM1mmmm") // ASIMD
+INST(asimd_VRHADD, "VRHADD", "1111001U0Dzznnnndddd0001NQM0mmmm") // ASIMD
+INST(asimd_VAND_reg, "VAND (register)", "111100100D00nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VBIC_reg, "VBIC (register)", "111100100D01nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VORR_reg, "VORR (register)", "111100100D10nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VORN_reg, "VORN (register)", "111100100D11nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VEOR_reg, "VEOR (register)", "111100110D00nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VBSL, "VBSL", "111100110D01nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VBIT, "VBIT", "111100110D10nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VBIF, "VBIF", "111100110D11nnnndddd0001NQM1mmmm") // ASIMD
+INST(asimd_VHSUB, "VHSUB", "1111001U0Dzznnnndddd0010NQM0mmmm") // ASIMD
+INST(asimd_VQSUB, "VQSUB", "1111001U0Dzznnnndddd0010NQM1mmmm") // ASIMD
+INST(asimd_VCGT_reg, "VCGT (register)", "1111001U0Dzznnnndddd0011NQM0mmmm") // ASIMD
+INST(asimd_VCGE_reg, "VCGE (register)", "1111001U0Dzznnnndddd0011NQM1mmmm") // ASIMD
+INST(asimd_VSHL_reg, "VSHL (register)", "1111001U0Dzznnnndddd0100NQM0mmmm") // ASIMD
+INST(asimd_VQSHL_reg, "VQSHL (register)", "1111001U0Dzznnnndddd0100NQM1mmmm") // ASIMD
+INST(asimd_VRSHL, "VRSHL", "1111001U0Dzznnnndddd0101NQM0mmmm") // ASIMD
+//INST(asimd_VQRSHL, "VQRSHL", "1111001U0-CC--------0101---1----") // ASIMD
+INST(asimd_VMAX, "VMAX/VMIN (integer)", "1111001U0Dzznnnndddd0110NQMommmm") // ASIMD
+INST(asimd_VABD, "VABD", "1111001U0Dzznnnndddd0111NQM0mmmm") // ASIMD
+INST(asimd_VABA, "VABA", "1111001U0Dzznnnndddd0111NQM1mmmm") // ASIMD
+INST(asimd_VADD_int, "VADD (integer)", "111100100Dzznnnndddd1000NQM0mmmm") // ASIMD
+INST(asimd_VSUB_int, "VSUB (integer)", "111100110Dzznnnndddd1000NQM0mmmm") // ASIMD
+INST(asimd_VTST, "VTST", "111100100Dzznnnndddd1000NQM1mmmm") // ASIMD
+INST(asimd_VCEQ_reg, "VCEQ (register)", "111100110Dzznnnndddd1000NQM1mmmm") // ASIMD
+INST(asimd_VMLA, "VMLA/VMLS", "1111001o0Dzznnnndddd1001NQM0mmmm") // ASIMD
+INST(asimd_VMUL, "VMUL", "1111001P0Dzznnnndddd1001NQM1mmmm") // ASIMD
+INST(asimd_VPMAX_int, "VPMAX/VPMIN (integer)", "1111001U0Dzznnnndddd1010NQMommmm") // ASIMD
+INST(v8_VMAXNM, "VMAXNM", "111100110D0znnnndddd1111NQM1mmmm") // v8
+INST(v8_VMINNM, "VMINNM", "111100110D1znnnndddd1111NQM1mmmm") // v8
+INST(asimd_VQDMULH, "VQDMULH", "111100100Dzznnnndddd1011NQM0mmmm") // ASIMD
+INST(asimd_VQRDMULH, "VQRDMULH", "111100110Dzznnnndddd1011NQM0mmmm") // ASIMD
+INST(asimd_VPADD, "VPADD", "111100100Dzznnnndddd1011NQM1mmmm") // ASIMD
+INST(asimd_VFMA, "VFMA", "111100100D0znnnndddd1100NQM1mmmm") // ASIMD
+INST(asimd_VFMS, "VFMS", "111100100D1znnnndddd1100NQM1mmmm") // ASIMD
+INST(asimd_VADD_float, "VADD (floating-point)", "111100100D0znnnndddd1101NQM0mmmm") // ASIMD
+INST(asimd_VSUB_float, "VSUB (floating-point)", "111100100D1znnnndddd1101NQM0mmmm") // ASIMD
+INST(asimd_VPADD_float, "VPADD (floating-point)", "111100110D0znnnndddd1101NQM0mmmm") // ASIMD
+INST(asimd_VABD_float, "VABD (floating-point)", "111100110D1znnnndddd1101NQM0mmmm") // ASIMD
+INST(asimd_VMLA_float, "VMLA (floating-point)", "111100100D0znnnndddd1101NQM1mmmm") // ASIMD
+INST(asimd_VMLS_float, "VMLS (floating-point)", "111100100D1znnnndddd1101NQM1mmmm") // ASIMD
+INST(asimd_VMUL_float, "VMUL (floating-point)", "111100110D0znnnndddd1101NQM1mmmm") // ASIMD
+INST(asimd_VCEQ_reg_float, "VCEQ (register)", "111100100D0znnnndddd1110NQM0mmmm") // ASIMD
+INST(asimd_VCGE_reg_float, "VCGE (register)", "111100110D0znnnndddd1110NQM0mmmm") // ASIMD
+INST(asimd_VCGT_reg_float, "VCGT (register)", "111100110D1znnnndddd1110NQM0mmmm") // ASIMD
+INST(asimd_VACGE, "VACGE", "111100110Doznnnndddd1110NQM1mmmm") // ASIMD
+INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
+INST(asimd_VMIN_float, "VMIN (floating-point)", "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
+INST(asimd_VPMAX_float, "VPMAX (floating-point)", "111100110D0znnnndddd1111NQM0mmmm") // ASIMD
+INST(asimd_VPMIN_float, "VPMIN (floating-point)", "111100110D1znnnndddd1111NQM0mmmm") // ASIMD
+INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
+INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
+INST(v8_SHA256H, "SHA256H", "111100110D00nnnndddd1100NQM0mmmm") // v8
+INST(v8_SHA256H2, "SHA256H2", "111100110D01nnnndddd1100NQM0mmmm") // v8
+INST(v8_SHA256SU1, "SHA256SU1", "111100110D10nnnndddd1100NQM0mmmm") // v8
+
+// Three registers of different lengths
+INST(asimd_VADDL, "VADDL/VADDW", "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD
+INST(asimd_VSUBL, "VSUBL/VSUBW", "1111001U1Dzznnnndddd001oN0M0mmmm") // ASIMD
+//INST(asimd_VADDHN, "VADDHN", "111100101-----------0100-0-0----") // ASIMD
+//INST(asimd_VRADDHN, "VRADDHN", "111100111-----------0100-0-0----") // ASIMD
+INST(asimd_VABAL, "VABAL", "1111001U1Dzznnnndddd0101N0M0mmmm") // ASIMD
+//INST(asimd_VSUBHN, "VSUBHN", "111100101-----------0110-0-0----") // ASIMD
+//INST(asimd_VRSUBHN, "VRSUBHN", "111100111-----------0110-0-0----") // ASIMD
+INST(asimd_VABDL, "VABDL", "1111001U1Dzznnnndddd0111N0M0mmmm") // ASIMD
+INST(asimd_VMLAL, "VMLAL/VMLSL", "1111001U1Dzznnnndddd10o0N0M0mmmm") // ASIMD
+//INST(asimd_VQDMLAL, "VQDMLAL", "111100101-----------10-1-0-0----") // ASIMD
+INST(asimd_VMULL, "VMULL", "1111001U1Dzznnnndddd11P0N0M0mmmm") // ASIMD
+//INST(asimd_VQDMULL, "VQDMULL", "111100101-----------1101-0-0----") // ASIMD
+
+// Two registers and a scalar
+INST(asimd_VMLA_scalar, "VMLA (scalar)", "1111001Q1Dzznnnndddd0o0FN1M0mmmm") // ASIMD
+INST(asimd_VMLAL_scalar, "VMLAL (scalar)", "1111001U1Dzznnnndddd0o10N1M0mmmm") // ASIMD
+//INST(asimd_VQDMLAL_scalar, "VQDMLAL/VQDMLSL (scalar)", "111100101-BB--------0x11-1-0----") // ASIMD
+INST(asimd_VMUL_scalar, "VMUL (scalar)", "1111001Q1Dzznnnndddd100FN1M0mmmm") // ASIMD
+INST(asimd_VMULL_scalar, "VMULL (scalar)", "1111001U1Dzznnnndddd1010N1M0mmmm") // ASIMD
+INST(asimd_VQDMULL_scalar, "VQDMULL (scalar)", "111100101Dzznnnndddd1011N1M0mmmm") // ASIMD
+INST(asimd_VQDMULH_scalar, "VQDMULH (scalar)", "1111001Q1Dzznnnndddd1100N1M0mmmm") // ASIMD
+INST(asimd_VQRDMULH_scalar, "VQRDMULH (scalar)", "1111001Q1Dzznnnndddd1101N1M0mmmm") // ASIMD
+
+// Two registers and a shift amount
+INST(asimd_SHR, "SHR", "1111001U1Diiiiiidddd0000LQM1mmmm") // ASIMD
+INST(asimd_SRA, "SRA", "1111001U1Diiiiiidddd0001LQM1mmmm") // ASIMD
+INST(asimd_VRSHR, "VRSHR", "1111001U1Diiiiiidddd0010LQM1mmmm") // ASIMD
+INST(asimd_VRSRA, "VRSRA", "1111001U1Diiiiiidddd0011LQM1mmmm") // ASIMD
+INST(asimd_VSRI, "VSRI", "111100111Diiiiiidddd0100LQM1mmmm") // ASIMD
+INST(asimd_VSHL, "VSHL", "111100101Diiiiiidddd0101LQM1mmmm") // ASIMD
+INST(asimd_VSLI, "VSLI", "111100111Diiiiiidddd0101LQM1mmmm") // ASIMD
+INST(asimd_VQSHL, "VQSHL", "1111001U1Diiiiiidddd011oLQM1mmmm") // ASIMD
+INST(asimd_VSHRN, "VSHRN", "111100101Diiiiiidddd100000M1mmmm") // ASIMD
+INST(asimd_VRSHRN, "VRSHRN", "111100101Diiiiiidddd100001M1mmmm") // ASIMD
+INST(asimd_VQSHRUN, "VQSHRUN", "111100111Diiiiiidddd100000M1mmmm") // ASIMD
+INST(asimd_VQRSHRUN, "VQRSHRUN", "111100111Diiiiiidddd100001M1mmmm") // ASIMD
+INST(asimd_VQSHRN, "VQSHRN", "1111001U1Diiiiiidddd100100M1mmmm") // ASIMD
+INST(asimd_VQRSHRN, "VQRSHRN", "1111001U1Diiiiiidddd100101M1mmmm") // ASIMD
+INST(asimd_VSHLL, "VSHLL", "1111001U1Diiiiiidddd101000M1mmmm") // ASIMD
+INST(asimd_VCVT_fixed, "VCVT (fixed-point)", "1111001U1Diiiiiidddd111o0QM1mmmm") // ASIMD
+
+// Two registers, miscellaneous
+INST(asimd_VREV, "VREV{16,32,64}", "111100111D11zz00dddd000ooQM0mmmm") // ASIMD
+INST(asimd_VPADDL, "VPADDL", "111100111D11zz00dddd0010oQM0mmmm") // ASIMD
+INST(asimd_VCLS, "VCLS", "111100111D11zz00dddd01000QM0mmmm") // ASIMD
+INST(asimd_VCLZ, "VCLZ", "111100111D11zz00dddd01001QM0mmmm") // ASIMD
+INST(asimd_VCNT, "VCNT", "111100111D11zz00dddd01010QM0mmmm") // ASIMD
+INST(asimd_VMVN_reg, "VMVN_reg", "111100111D11zz00dddd01011QM0mmmm") // ASIMD
+INST(asimd_VPADAL, "VPADAL", "111100111D11zz00dddd0110oQM0mmmm") // ASIMD
+INST(asimd_VQABS, "VQABS", "111100111D11zz00dddd01110QM0mmmm") // ASIMD
+INST(asimd_VQNEG, "VQNEG", "111100111D11zz00dddd01111QM0mmmm") // ASIMD
+INST(asimd_VCGT_zero, "VCGT (zero)", "111100111D11zz01dddd0F000QM0mmmm") // ASIMD
+INST(asimd_VCGE_zero, "VCGE (zero)", "111100111D11zz01dddd0F001QM0mmmm") // ASIMD
+INST(asimd_VCEQ_zero, "VCEQ (zero)", "111100111D11zz01dddd0F010QM0mmmm") // ASIMD
+INST(asimd_VCLE_zero, "VCLE (zero)", "111100111D11zz01dddd0F011QM0mmmm") // ASIMD
+INST(asimd_VCLT_zero, "VCLT (zero)", "111100111D11zz01dddd0F100QM0mmmm") // ASIMD
+INST(arm_UDF, "UNALLOCATED", "111100111-11--01----01101--0----") // v8
+INST(asimd_VABS, "VABS", "111100111D11zz01dddd0F110QM0mmmm") // ASIMD
+INST(asimd_VNEG, "VNEG", "111100111D11zz01dddd0F111QM0mmmm") // ASIMD
+INST(asimd_VSWP, "VSWP", "111100111D110010dddd00000QM0mmmm") // ASIMD
+INST(arm_UDF, "UNALLOCATED", "111100111-11--10----00000--0----") // ASIMD
+INST(asimd_VTRN, "VTRN", "111100111D11zz10dddd00001QM0mmmm") // ASIMD
+INST(asimd_VUZP, "VUZP", "111100111D11zz10dddd00010QM0mmmm") // ASIMD
+INST(asimd_VZIP, "VZIP", "111100111D11zz10dddd00011QM0mmmm") // ASIMD
+INST(asimd_VMOVN, "VMOVN", "111100111D11zz10dddd001000M0mmmm") // ASIMD
+INST(asimd_VQMOVUN, "VQMOVUN", "111100111D11zz10dddd001001M0mmmm") // ASIMD
+INST(asimd_VQMOVN, "VQMOVN", "111100111D11zz10dddd00101oM0mmmm") // ASIMD
+INST(asimd_VSHLL_max, "VSHLL_max", "111100111D11zz10dddd001100M0mmmm") // ASIMD
+INST(v8_VRINTN, "VRINTN", "111100111D11zz10dddd01000QM0mmmm") // v8
+INST(v8_VRINTX, "VRINTX", "111100111D11zz10dddd01001QM0mmmm") // v8
+INST(v8_VRINTA, "VRINTA", "111100111D11zz10dddd01010QM0mmmm") // v8
+INST(v8_VRINTZ, "VRINTZ", "111100111D11zz10dddd01011QM0mmmm") // v8
+INST(v8_VRINTM, "VRINTM", "111100111D11zz10dddd01101QM0mmmm") // v8
+INST(v8_VRINTP, "VRINTP", "111100111D11zz10dddd01111QM0mmmm") // v8
+INST(asimd_VCVT_half, "VCVT (half-precision)", "111100111D11zz10dddd011o00M0mmmm") // ASIMD
+INST(arm_UDF, "UNALLOCATED", "111100111-11--10----011-01-0----") // ASIMD
+INST(v8_VCVTA, "VCVTA", "111100111D11zz11dddd0000oQM0mmmm") // v8
+INST(v8_VCVTN, "VCVTN", "111100111D11zz11dddd0001oQM0mmmm") // v8
+INST(v8_VCVTP, "VCVTP", "111100111D11zz11dddd0010oQM0mmmm") // v8
+INST(v8_VCVTM, "VCVTM", "111100111D11zz11dddd0011oQM0mmmm") // v8
+INST(asimd_VRECPE, "VRECPE", "111100111D11zz11dddd010F0QM0mmmm") // ASIMD
+INST(asimd_VRSQRTE, "VRSQRTE", "111100111D11zz11dddd010F1QM0mmmm") // ASIMD
+INST(asimd_VCVT_integer, "VCVT (integer)", "111100111D11zz11dddd011oUQM0mmmm") // ASIMD
+
+// Two registers, cryptography
+INST(v8_AESE, "AESE", "111100111D11zz00dddd001100M0mmmm") // v8
+INST(v8_AESD, "AESD", "111100111D11zz00dddd001101M0mmmm") // v8
+INST(v8_AESMC, "AESMC", "111100111D11zz00dddd001110M0mmmm") // v8
+INST(v8_AESIMC, "AESIMC", "111100111D11zz00dddd001111M0mmmm") // v8
+INST(arm_UDF, "UNALLOCATED", "111100111-11--01----001010-0----") // v8
+INST(arm_UDF, "UNALLOCATED (SHA1H)", "111100111-11--01----001011-0----") // v8
+INST(arm_UDF, "UNALLOCATED (SHA1SU1)", "111100111-11--10----001110-0----") // v8
+INST(v8_SHA256SU0, "SHA256SU0", "111100111D11zz10dddd001111M0mmmm") // v8
+
+// One register and modified immediate
+INST(asimd_VMOV_imm, "VBIC, VMOV, VMVN, VORR (immediate)", "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD
+
+// Miscellaneous
+INST(asimd_VEXT, "VEXT", "111100101D11nnnnddddiiiiNQM0mmmm") // ASIMD
+INST(asimd_VTBL, "VTBL", "111100111D11nnnndddd10zzN0M0mmmm") // ASIMD
+INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD
+INST(asimd_VDUP_scalar, "VDUP (scalar)", "111100111D11iiiidddd11000QM0mmmm") // ASIMD
+INST(arm_UDF, "UNALLOCATED", "111100111-11--------11-----0----") // ASIMD
+
+// Advanced SIMD load/store structures
+INST(v8_VST_multiple, "VST{1-4} (multiple)", "111101000D00nnnnddddxxxxzzaammmm") // v8
+INST(v8_VLD_multiple, "VLD{1-4} (multiple)", "111101000D10nnnnddddxxxxzzaammmm") // v8
+INST(arm_UDF, "UNALLOCATED", "111101000--0--------1011--------") // v8
+INST(arm_UDF, "UNALLOCATED", "111101000--0--------11----------") // v8
+INST(arm_UDF, "UNALLOCATED", "111101001-00--------11----------") // v8
+INST(v8_VLD_all_lanes, "VLD{1-4} (all lanes)", "111101001D10nnnndddd11nnzzTammmm") // v8
+INST(v8_VST_single, "VST{1-4} (single)", "111101001D00nnnnddddzzNNaaaammmm") // v8
+INST(v8_VLD_single, "VLD{1-4} (single)", "111101001D10nnnnddddzzNNaaaammmm") // v8
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h
new file mode 100644
index 0000000000..6a4275f726
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h
@@ -0,0 +1,39 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A32 {
+
+template<typename Visitor>
+using Thumb16Matcher = Decoder::Matcher<Visitor, u16>;
+
+template<typename V>
+std::optional<std::reference_wrapper<const Thumb16Matcher<V>>> DecodeThumb16(u16 instruction) {
+ static const std::vector<Thumb16Matcher<V>> table = {
+
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Thumb16Matcher, fn, name, Decoder::detail::StringToArray<16>(bitstring)),
+#include "./thumb16.inc"
+#undef INST
+
+ };
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ auto iter = std::find_if(table.begin(), table.end(), matches_instruction);
+ return iter != table.end() ? std::optional<std::reference_wrapper<const Thumb16Matcher<V>>>(*iter) : std::nullopt;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc
new file mode 100644
index 0000000000..f01c98aa05
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc
@@ -0,0 +1,98 @@
+// Shift (immediate) add, subtract, move and compare instructions
+INST(thumb16_LSL_imm, "LSL (imm)", "00000vvvvvmmmddd")
+INST(thumb16_LSR_imm, "LSR (imm)", "00001vvvvvmmmddd")
+INST(thumb16_ASR_imm, "ASR (imm)", "00010vvvvvmmmddd")
+INST(thumb16_ADD_reg_t1, "ADD (reg, T1)", "0001100mmmnnnddd")
+INST(thumb16_SUB_reg, "SUB (reg)", "0001101mmmnnnddd")
+INST(thumb16_ADD_imm_t1, "ADD (imm, T1)", "0001110vvvnnnddd")
+INST(thumb16_SUB_imm_t1, "SUB (imm, T1)", "0001111vvvnnnddd")
+INST(thumb16_MOV_imm, "MOV (imm)", "00100dddvvvvvvvv")
+INST(thumb16_CMP_imm, "CMP (imm)", "00101nnnvvvvvvvv")
+INST(thumb16_ADD_imm_t2, "ADD (imm, T2)", "00110dddvvvvvvvv")
+INST(thumb16_SUB_imm_t2, "SUB (imm, T2)", "00111dddvvvvvvvv")
+
+// Data-processing instructions
+INST(thumb16_AND_reg, "AND (reg)", "0100000000mmmddd")
+INST(thumb16_EOR_reg, "EOR (reg)", "0100000001mmmddd")
+INST(thumb16_LSL_reg, "LSL (reg)", "0100000010mmmddd")
+INST(thumb16_LSR_reg, "LSR (reg)", "0100000011mmmddd")
+INST(thumb16_ASR_reg, "ASR (reg)", "0100000100mmmddd")
+INST(thumb16_ADC_reg, "ADC (reg)", "0100000101mmmddd")
+INST(thumb16_SBC_reg, "SBC (reg)", "0100000110mmmddd")
+INST(thumb16_ROR_reg, "ROR (reg)", "0100000111sssddd")
+INST(thumb16_TST_reg, "TST (reg)", "0100001000mmmnnn")
+INST(thumb16_RSB_imm, "RSB (imm)", "0100001001nnnddd")
+INST(thumb16_CMP_reg_t1, "CMP (reg, T1)", "0100001010mmmnnn")
+INST(thumb16_CMN_reg, "CMN (reg)", "0100001011mmmnnn")
+INST(thumb16_ORR_reg, "ORR (reg)", "0100001100mmmddd")
+INST(thumb16_MUL_reg, "MUL (reg)", "0100001101nnnddd")
+INST(thumb16_BIC_reg, "BIC (reg)", "0100001110mmmddd")
+INST(thumb16_MVN_reg, "MVN (reg)", "0100001111mmmddd")
+
+// Special data instructions
+INST(thumb16_ADD_reg_t2, "ADD (reg, T2)", "01000100Dmmmmddd") // v4T, Low regs: v6T2
+INST(thumb16_CMP_reg_t2, "CMP (reg, T2)", "01000101Nmmmmnnn") // v4T
+INST(thumb16_MOV_reg, "MOV (reg)", "01000110Dmmmmddd") // v4T, Low regs: v6
+
+// Store/Load single data item instructions
+INST(thumb16_LDR_literal, "LDR (literal)", "01001tttvvvvvvvv")
+INST(thumb16_STR_reg, "STR (reg)", "0101000mmmnnnttt")
+INST(thumb16_STRH_reg, "STRH (reg)", "0101001mmmnnnttt")
+INST(thumb16_STRB_reg, "STRB (reg)", "0101010mmmnnnttt")
+INST(thumb16_LDRSB_reg, "LDRSB (reg)", "0101011mmmnnnttt")
+INST(thumb16_LDR_reg, "LDR (reg)", "0101100mmmnnnttt")
+INST(thumb16_LDRH_reg, "LDRH (reg)", "0101101mmmnnnttt")
+INST(thumb16_LDRB_reg, "LDRB (reg)", "0101110mmmnnnttt")
+INST(thumb16_LDRSH_reg, "LDRSH (reg)", "0101111mmmnnnttt")
+INST(thumb16_STR_imm_t1, "STR (imm, T1)", "01100vvvvvnnnttt")
+INST(thumb16_LDR_imm_t1, "LDR (imm, T1)", "01101vvvvvnnnttt")
+INST(thumb16_STRB_imm, "STRB (imm)", "01110vvvvvnnnttt")
+INST(thumb16_LDRB_imm, "LDRB (imm)", "01111vvvvvnnnttt")
+INST(thumb16_STRH_imm, "STRH (imm)", "10000vvvvvnnnttt")
+INST(thumb16_LDRH_imm, "LDRH (imm)", "10001vvvvvnnnttt")
+INST(thumb16_STR_imm_t2, "STR (imm, T2)", "10010tttvvvvvvvv")
+INST(thumb16_LDR_imm_t2, "LDR (imm, T2)", "10011tttvvvvvvvv")
+
+// Generate relative address instructions
+INST(thumb16_ADR, "ADR", "10100dddvvvvvvvv")
+INST(thumb16_ADD_sp_t1, "ADD (SP plus imm, T1)", "10101dddvvvvvvvv")
+INST(thumb16_ADD_sp_t2, "ADD (SP plus imm, T2)", "101100000vvvvvvv") // v4T
+INST(thumb16_SUB_sp, "SUB (SP minus imm)", "101100001vvvvvvv") // v4T
+
+// Hint instructions
+INST(thumb16_SEV, "SEV", "1011111101000000") // v7
+INST(thumb16_SEVL, "SEVL", "1011111101010000") // v8
+INST(thumb16_WFE, "WFE", "1011111100100000") // v7
+INST(thumb16_WFI, "WFI", "1011111100110000") // v7
+INST(thumb16_YIELD, "YIELD", "1011111100010000") // v7
+INST(thumb16_NOP, "NOP", "10111111----0000") // v7
+
+// IT instruction
+INST(thumb16_IT, "IT", "10111111iiiiiiii") // v7
+
+// Miscellaneous 16-bit instructions
+INST(thumb16_SXTH, "SXTH", "1011001000mmmddd") // v6
+INST(thumb16_SXTB, "SXTB", "1011001001mmmddd") // v6
+INST(thumb16_UXTH, "UXTH", "1011001010mmmddd") // v6
+INST(thumb16_UXTB, "UXTB", "1011001011mmmddd") // v6
+INST(thumb16_PUSH, "PUSH", "1011010Mxxxxxxxx") // v4T
+INST(thumb16_POP, "POP", "1011110Pxxxxxxxx") // v4T
+INST(thumb16_SETEND, "SETEND", "101101100101x000") // v6
+INST(thumb16_CPS, "CPS", "10110110011m0aif") // v6
+INST(thumb16_REV, "REV", "1011101000mmmddd") // v6
+INST(thumb16_REV16, "REV16", "1011101001mmmddd") // v6
+INST(thumb16_REVSH, "REVSH", "1011101011mmmddd") // v6
+INST(thumb16_BKPT, "BKPT", "10111110xxxxxxxx") // v5
+
+// Store/Load multiple registers
+INST(thumb16_STMIA, "STMIA", "11000nnnxxxxxxxx")
+INST(thumb16_LDMIA, "LDMIA", "11001nnnxxxxxxxx")
+
+// Branch instructions
+INST(thumb16_BX, "BX", "010001110mmmm000") // v4T
+INST(thumb16_BLX_reg, "BLX (reg)", "010001111mmmm000") // v5T
+INST(thumb16_CBZ_CBNZ, "CBZ/CBNZ", "1011o0i1iiiiinnn") // v6T2
+INST(thumb16_UDF, "UDF", "11011110--------")
+INST(thumb16_SVC, "SVC", "11011111xxxxxxxx")
+INST(thumb16_B_t1, "B (T1)", "1101ccccvvvvvvvv")
+INST(thumb16_B_t2, "B (T2)", "11100vvvvvvvvvvv")
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h
new file mode 100644
index 0000000000..f3f4b3b9ee
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h
@@ -0,0 +1,38 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A32 {
+
+template<typename Visitor>
+using Thumb32Matcher = Decoder::Matcher<Visitor, u32>;
+
+template<typename V>
+std::optional<std::reference_wrapper<const Thumb32Matcher<V>>> DecodeThumb32(u32 instruction) {
+ static const std::vector<Thumb32Matcher<V>> table = {
+
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Thumb32Matcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)),
+#include "./thumb32.inc"
+#undef INST
+
+ };
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ auto iter = std::find_if(table.begin(), table.end(), matches_instruction);
+ return iter != table.end() ? std::optional<std::reference_wrapper<const Thumb32Matcher<V>>>(*iter) : std::nullopt;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc
new file mode 100644
index 0000000000..ecc7a65538
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc
@@ -0,0 +1,292 @@
+// Load/Store Multiple
+//INST(thumb32_SRS_1, "SRS", "1110100000-0--------------------")
+//INST(thumb32_RFE_2, "RFE", "1110100000-1--------------------")
+INST(thumb32_STMIA, "STMIA/STMEA", "1110100010W0nnnn0iiiiiiiiiiiiiii")
+INST(thumb32_POP, "POP", "1110100010111101iiiiiiiiiiiiiiii")
+INST(thumb32_LDMIA, "LDMIA/LDMFD", "1110100010W1nnnniiiiiiiiiiiiiiii")
+INST(thumb32_PUSH, "PUSH", "11101001001011010iiiiiiiiiiiiiii")
+INST(thumb32_STMDB, "STMDB/STMFD", "1110100100W0nnnn0iiiiiiiiiiiiiii")
+INST(thumb32_LDMDB, "LDMDB/LDMEA", "1110100100W1nnnniiiiiiiiiiiiiiii")
+//INST(thumb32_SRS_1, "SRS", "1110100110-0--------------------")
+//INST(thumb32_RFE_2, "RFE", "1110100110-1--------------------")
+
+// Load/Store Dual, Load/Store Exclusive, Table Branch
+INST(thumb32_STREX, "STREX", "111010000100nnnnttttddddiiiiiiii")
+INST(thumb32_LDREX, "LDREX", "111010000101nnnntttt1111iiiiiiii")
+INST(thumb32_STRD_imm_1, "STRD (imm)", "11101000U110nnnnttttssssiiiiiiii")
+INST(thumb32_STRD_imm_2, "STRD (imm)", "11101001U1W0nnnnttttssssiiiiiiii")
+INST(thumb32_LDRD_lit_1, "LDRD (lit)", "11101000U1111111ttttssssiiiiiiii")
+INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii")
+INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii")
+INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii")
+INST(thumb32_STL, "STL", "111010001100nnnntttt111110101111") // v8
+INST(thumb32_LDA, "LDA", "111010001101nnnntttt111110101111") // v8
+INST(thumb32_STREXB, "STREXB", "111010001100nnnntttt11110100dddd")
+INST(thumb32_STREXH, "STREXH", "111010001100nnnntttt11110101dddd")
+INST(thumb32_STREXD, "STREXD", "111010001100nnnnttttuuuu0111dddd")
+INST(thumb32_TBB, "TBB", "111010001101nnnn111100000000mmmm")
+INST(thumb32_TBH, "TBH", "111010001101nnnn111100000001mmmm")
+INST(thumb32_LDREXB, "LDREXB", "111010001101nnnntttt111101001111")
+INST(thumb32_LDREXH, "LDREXH", "111010001101nnnntttt111101011111")
+INST(thumb32_LDREXD, "LDREXD", "111010001101nnnnttttuuuu01111111")
+
+// Data Processing (Shifted Register)
+INST(thumb32_TST_reg, "TST (reg)", "111010100001nnnn0vvv1111vvttmmmm")
+INST(thumb32_AND_reg, "AND (reg)", "11101010000Snnnn0vvvddddvvttmmmm")
+INST(thumb32_BIC_reg, "BIC (reg)", "11101010001Snnnn0vvvddddvvttmmmm")
+INST(thumb32_MOV_reg, "MOV (reg)", "11101010010S11110vvvddddvvttmmmm")
+INST(thumb32_ORR_reg, "ORR (reg)", "11101010010Snnnn0vvvddddvvttmmmm")
+INST(thumb32_MVN_reg, "MVN (reg)", "11101010011S11110vvvddddvvttmmmm")
+INST(thumb32_ORN_reg, "ORN (reg)", "11101010011Snnnn0vvvddddvvttmmmm")
+INST(thumb32_TEQ_reg, "TEQ (reg)", "111010101001nnnn0vvv1111vvttmmmm")
+INST(thumb32_EOR_reg, "EOR (reg)", "11101010100Snnnn0vvvddddvvttmmmm")
+INST(thumb32_PKH, "PKH", "111010101100nnnn0vvvddddvvt0mmmm")
+INST(thumb32_CMN_reg, "CMN (reg)", "111010110001nnnn0vvv1111vvttmmmm")
+INST(thumb32_ADD_reg, "ADD (reg)", "11101011000Snnnn0vvvddddvvttmmmm")
+INST(thumb32_ADC_reg, "ADC (reg)", "11101011010Snnnn0vvvddddvvttmmmm")
+INST(thumb32_SBC_reg, "SBC (reg)", "11101011011Snnnn0vvvddddvvttmmmm")
+INST(thumb32_CMP_reg, "CMP (reg)", "111010111011nnnn0vvv1111vvttmmmm")
+INST(thumb32_SUB_reg, "SUB (reg)", "11101011101Snnnn0vvvddddvvttmmmm")
+INST(thumb32_RSB_reg, "RSB (reg)", "11101011110Snnnn0vvvddddvvttmmmm")
+
+// Data Processing (Modified Immediate)
+INST(thumb32_TST_imm, "TST (imm)", "11110v000001nnnn0vvv1111vvvvvvvv")
+INST(thumb32_AND_imm, "AND (imm)", "11110v00000Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_BIC_imm, "BIC (imm)", "11110v00001Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_MOV_imm, "MOV (imm)", "11110v00010S11110vvvddddvvvvvvvv")
+INST(thumb32_ORR_imm, "ORR (imm)", "11110v00010Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_MVN_imm, "MVN (imm)", "11110v00011S11110vvvddddvvvvvvvv")
+INST(thumb32_ORN_imm, "ORN (imm)", "11110v00011Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_TEQ_imm, "TEQ (imm)", "11110v001001nnnn0vvv1111vvvvvvvv")
+INST(thumb32_EOR_imm, "EOR (imm)", "11110v00100Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_CMN_imm, "CMN (imm)", "11110v010001nnnn0vvv1111vvvvvvvv")
+INST(thumb32_ADD_imm_1, "ADD (imm)", "11110v01000Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_ADC_imm, "ADC (imm)", "11110v01010Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_SBC_imm, "SBC (imm)", "11110v01011Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_CMP_imm, "CMP (imm)", "11110v011011nnnn0vvv1111vvvvvvvv")
+INST(thumb32_SUB_imm_1, "SUB (imm)", "11110v01101Snnnn0vvvddddvvvvvvvv")
+INST(thumb32_RSB_imm, "RSB (imm)", "11110v01110Snnnn0vvvddddvvvvvvvv")
+
+// Data Processing (Plain Binary Immediate)
+INST(thumb32_ADR_t3, "ADR", "11110i10000011110iiiddddiiiiiiii")
+INST(thumb32_ADD_imm_2, "ADD (imm)", "11110i100000nnnn0iiiddddiiiiiiii")
+INST(thumb32_MOVW_imm, "MOVW (imm)", "11110i100100iiii0iiiddddiiiiiiii")
+INST(thumb32_ADR_t2, "ADR", "11110i10101011110iiiddddiiiiiiii")
+INST(thumb32_SUB_imm_2, "SUB (imm)", "11110i101010nnnn0iiiddddiiiiiiii")
+INST(thumb32_MOVT, "MOVT", "11110i101100iiii0iiiddddiiiiiiii")
+INST(thumb32_UDF, "Invalid decoding", "11110011-010----0000----0001----")
+INST(thumb32_SSAT16, "SSAT16", "111100110010nnnn0000dddd0000iiii")
+INST(thumb32_USAT16, "USAT16", "111100111010nnnn0000dddd0000iiii")
+INST(thumb32_SSAT, "SSAT", "1111001100s0nnnn0iiiddddii0bbbbb")
+INST(thumb32_USAT, "USAT", "1111001110s0nnnn0iiiddddii0bbbbb")
+INST(thumb32_SBFX, "SBFX", "111100110100nnnn0iiiddddii0wwwww")
+INST(thumb32_BFC, "BFC", "11110011011011110iiiddddii0bbbbb")
+INST(thumb32_BFI, "BFI", "111100110110nnnn0iiiddddii0bbbbb")
+INST(thumb32_UBFX, "UBFX", "111100111100nnnn0iiiddddii0wwwww")
+
+// Branches and Miscellaneous Control
+//INST(thumb32_MSR_banked, "MSR (banked)", "11110011100-----10-0------1-----")
+INST(thumb32_MSR_reg, "MSR (reg)", "11110011100Rnnnn1000mmmm00000000")
+
+INST(thumb32_NOP, "NOP", "11110011101011111000000000000000")
+INST(thumb32_YIELD, "YIELD", "11110011101011111000000000000001")
+INST(thumb32_WFE, "WFE", "11110011101011111000000000000010")
+INST(thumb32_WFI, "WFI", "11110011101011111000000000000011")
+INST(thumb32_SEV, "SEV", "11110011101011111000000000000100")
+INST(thumb32_SEVL, "SEVL", "11110011101011111000000000000101")
+//INST(thumb32_DBG, "DBG", "111100111010----10-0-0001111----")
+//INST(thumb32_CPS, "CPS", "111100111010----10-0------------")
+
+//INST(thumb32_ENTERX, "ENTERX", "111100111011----10-0----0001----")
+//INST(thumb32_LEAVEX, "LEAVEX", "111100111011----10-0----0000----")
+INST(thumb32_CLREX, "CLREX", "11110011101111111000111100101111")
+INST(thumb32_DSB, "DSB", "1111001110111111100011110100oooo")
+INST(thumb32_DMB, "DMB", "1111001110111111100011110101oooo")
+INST(thumb32_ISB, "ISB", "1111001110111111100011110110oooo")
+
+INST(thumb32_BXJ, "BXJ", "111100111100mmmm1000111100000000")
+//INST(thumb32_ERET, "ERET", "11110011110111101000111100000000")
+//INST(thumb32_SUBS_pc_lr, "SUBS PC, LR", "111100111101111010001111--------")
+
+//INST(thumb32_MRS_banked, "MRS (banked)", "11110011111-----10-0------1-----")
+INST(thumb32_MRS_reg, "MRS (reg)", "11110011111R11111000dddd00000000")
+//INST(thumb32_HVC, "HVC", "111101111110----1000------------")
+//INST(thumb32_SMC, "SMC", "111101111111----1000000000000000")
+INST(thumb32_UDF, "UDF", "111101111111----1010------------") // v6T2
+
+// Branch instructions
+INST(thumb32_BL_imm, "BL (imm)", "11110Svvvvvvvvvv11j1jvvvvvvvvvvv") // v4T
+INST(thumb32_BLX_imm, "BLX (imm)", "11110Svvvvvvvvvv11j0jvvvvvvvvvvv") // v5T
+INST(thumb32_B, "B", "11110Svvvvvvvvvv10j1jvvvvvvvvvvv")
+INST(thumb32_UDF, "Invalid decoding", "11110-111-------10-0------------")
+INST(thumb32_B_cond, "B (cond)", "11110Sccccvvvvvv10i0ivvvvvvvvvvv")
+
+// Store Single Data Item
+INST(thumb32_STRB_imm_1, "STRB (imm)", "111110000000nnnntttt1PU1iiiiiiii")
+INST(thumb32_STRB_imm_2, "STRB (imm)", "111110000000nnnntttt1100iiiiiiii")
+INST(thumb32_STRB_imm_3, "STRB (imm)", "111110001000nnnnttttiiiiiiiiiiii")
+INST(thumb32_STRBT, "STRBT", "111110000000nnnntttt1110iiiiiiii")
+INST(thumb32_STRB, "STRB (reg)", "111110000000nnnntttt000000iimmmm")
+INST(thumb32_STRH_imm_1, "STRH (imm)", "111110000010nnnntttt1PU1iiiiiiii")
+INST(thumb32_STRH_imm_2, "STRH (imm)", "111110000010nnnntttt1100iiiiiiii")
+INST(thumb32_STRH_imm_3, "STRH (imm)", "111110001010nnnnttttiiiiiiiiiiii")
+INST(thumb32_STRHT, "STRHT", "111110000010nnnntttt1110iiiiiiii")
+INST(thumb32_STRH, "STRH (reg)", "111110000010nnnntttt000000iimmmm")
+INST(thumb32_STR_imm_1, "STR (imm)", "111110000100nnnntttt1PU1iiiiiiii")
+INST(thumb32_STR_imm_2, "STR (imm)", "111110000100nnnntttt1100iiiiiiii")
+INST(thumb32_STR_imm_3, "STR (imm)", "111110001100nnnnttttiiiiiiiiiiii")
+INST(thumb32_STRT, "STRT", "111110000100nnnntttt1110iiiiiiii")
+INST(thumb32_STR_reg, "STR (reg)", "111110000100nnnntttt000000iimmmm")
+
+// Load Byte and Memory Hints
+INST(thumb32_PLD_lit, "PLD (lit)", "11111000U00111111111iiiiiiiiiiii")
+INST(thumb32_PLD_lit, "PLD (lit)", "11111000U01111111111iiiiiiiiiiii")
+INST(thumb32_PLD_reg, "PLD (reg)", "1111100000W1nnnn1111000000iimmmm")
+INST(thumb32_PLD_imm8, "PLD (imm8)", "1111100000W1nnnn11111100iiiiiiii")
+INST(thumb32_PLD_imm12, "PLD (imm12)", "1111100010W1nnnn1111iiiiiiiiiiii")
+INST(thumb32_PLI_lit, "PLI (lit)", "11111001U00111111111iiiiiiiiiiii")
+INST(thumb32_PLI_reg, "PLI (reg)", "111110010001nnnn1111000000iimmmm")
+INST(thumb32_PLI_imm8, "PLI (imm8)", "111110010001nnnn11111100iiiiiiii")
+INST(thumb32_PLI_imm12, "PLI (imm12)", "111110011001nnnn1111iiiiiiiiiiii")
+INST(thumb32_LDRB_lit, "LDRB (lit)", "11111000U0011111ttttiiiiiiiiiiii")
+INST(thumb32_LDRB_reg, "LDRB (reg)", "111110000001nnnntttt000000iimmmm")
+INST(thumb32_LDRBT, "LDRBT", "111110000001nnnntttt1110iiiiiiii")
+INST(thumb32_LDRB_imm8, "LDRB (imm8)", "111110000001nnnntttt1PUWiiiiiiii")
+INST(thumb32_LDRB_imm12, "LDRB (imm12)", "111110001001nnnnttttiiiiiiiiiiii")
+INST(thumb32_LDRSB_lit, "LDRSB (lit)", "11111001U0011111ttttiiiiiiiiiiii")
+INST(thumb32_LDRSB_reg, "LDRSB (reg)", "111110010001nnnntttt000000iimmmm")
+INST(thumb32_LDRSBT, "LDRSBT", "111110010001nnnntttt1110iiiiiiii")
+INST(thumb32_LDRSB_imm8, "LDRSB (imm8)", "111110010001nnnntttt1PUWiiiiiiii")
+INST(thumb32_LDRSB_imm12, "LDRSB (imm12)", "111110011001nnnnttttiiiiiiiiiiii")
+
+// Load Halfword and Memory Hints
+INST(thumb32_LDRH_lit, "LDRH (lit)", "11111000U0111111ttttiiiiiiiiiiii")
+INST(thumb32_LDRH_reg, "LDRH (reg)", "111110000011nnnntttt000000iimmmm")
+INST(thumb32_LDRHT, "LDRHT", "111110000011nnnntttt1110iiiiiiii")
+INST(thumb32_LDRH_imm8, "LDRH (imm8)", "111110000011nnnntttt1PUWiiiiiiii")
+INST(thumb32_LDRH_imm12, "LDRH (imm12)", "111110001011nnnnttttiiiiiiiiiiii")
+INST(thumb32_NOP, "NOP", "11111001-01111111111------------")
+INST(thumb32_LDRSH_lit, "LDRSH (lit)", "11111001U0111111ttttiiiiiiiiiiii")
+INST(thumb32_NOP, "NOP", "111110010011----1111000000------")
+INST(thumb32_LDRSH_reg, "LDRSH (reg)", "111110010011nnnntttt000000iimmmm")
+INST(thumb32_LDRSHT, "LDRSHT", "111110010011nnnntttt1110iiiiiiii")
+INST(thumb32_NOP, "NOP", "111110010011----11111100--------")
+INST(thumb32_NOP, "NOP", "111110011011----1111------------")
+INST(thumb32_LDRSH_imm8, "LDRSH (imm8)", "111110010011nnnntttt1PUWiiiiiiii")
+INST(thumb32_LDRSH_imm12, "LDRSH (imm12)", "111110011011nnnnttttiiiiiiiiiiii")
+
+// Load Word
+INST(thumb32_LDR_lit, "LDR (lit)", "11111000U1011111ttttiiiiiiiiiiii")
+INST(thumb32_LDRT, "LDRT", "111110000101nnnntttt1110iiiiiiii")
+INST(thumb32_LDR_reg, "LDR (reg)", "111110000101nnnntttt000000iimmmm")
+INST(thumb32_LDR_imm8, "LDR (imm8)", "111110000101nnnntttt1PUWiiiiiiii")
+INST(thumb32_LDR_imm12, "LDR (imm12)", "111110001101nnnnttttiiiiiiiiiiii")
+
+// Data Processing (register)
+INST(thumb32_LSL_reg, "LSL (reg)", "11111010000Smmmm1111dddd0000ssss")
+INST(thumb32_LSR_reg, "LSR (reg)", "11111010001Smmmm1111dddd0000ssss")
+INST(thumb32_ASR_reg, "ASR (reg)", "11111010010Smmmm1111dddd0000ssss")
+INST(thumb32_ROR_reg, "ROR (reg)", "11111010011Smmmm1111dddd0000ssss")
+INST(thumb32_SXTH, "SXTH", "11111010000011111111dddd10rrmmmm")
+INST(thumb32_SXTAH, "SXTAH", "111110100000nnnn1111dddd10rrmmmm")
+INST(thumb32_UXTH, "UXTH", "11111010000111111111dddd10rrmmmm")
+INST(thumb32_UXTAH, "UXTAH", "111110100001nnnn1111dddd10rrmmmm")
+INST(thumb32_SXTB16, "SXTB16", "11111010001011111111dddd10rrmmmm")
+INST(thumb32_SXTAB16, "SXTAB16", "111110100010nnnn1111dddd10rrmmmm")
+INST(thumb32_UXTB16, "UXTB16", "11111010001111111111dddd10rrmmmm")
+INST(thumb32_UXTAB16, "UXTAB16", "111110100011nnnn1111dddd10rrmmmm")
+INST(thumb32_SXTB, "SXTB", "11111010010011111111dddd10rrmmmm")
+INST(thumb32_SXTAB, "SXTAB", "111110100100nnnn1111dddd10rrmmmm")
+INST(thumb32_UXTB, "UXTB", "11111010010111111111dddd10rrmmmm")
+INST(thumb32_UXTAB, "UXTAB", "111110100101nnnn1111dddd10rrmmmm")
+
+// Parallel Addition and Subtraction (signed)
+INST(thumb32_SADD16, "SADD16", "111110101001nnnn1111dddd0000mmmm")
+INST(thumb32_SASX, "SASX", "111110101010nnnn1111dddd0000mmmm")
+INST(thumb32_SSAX, "SSAX", "111110101110nnnn1111dddd0000mmmm")
+INST(thumb32_SSUB16, "SSUB16", "111110101101nnnn1111dddd0000mmmm")
+INST(thumb32_SADD8, "SADD8", "111110101000nnnn1111dddd0000mmmm")
+INST(thumb32_SSUB8, "SSUB8", "111110101100nnnn1111dddd0000mmmm")
+INST(thumb32_QADD16, "QADD16", "111110101001nnnn1111dddd0001mmmm")
+INST(thumb32_QASX, "QASX", "111110101010nnnn1111dddd0001mmmm")
+INST(thumb32_QSAX, "QSAX", "111110101110nnnn1111dddd0001mmmm")
+INST(thumb32_QSUB16, "QSUB16", "111110101101nnnn1111dddd0001mmmm")
+INST(thumb32_QADD8, "QADD8", "111110101000nnnn1111dddd0001mmmm")
+INST(thumb32_QSUB8, "QSUB8", "111110101100nnnn1111dddd0001mmmm")
+INST(thumb32_SHADD16, "SHADD16", "111110101001nnnn1111dddd0010mmmm")
+INST(thumb32_SHASX, "SHASX", "111110101010nnnn1111dddd0010mmmm")
+INST(thumb32_SHSAX, "SHSAX", "111110101110nnnn1111dddd0010mmmm")
+INST(thumb32_SHSUB16, "SHSUB16", "111110101101nnnn1111dddd0010mmmm")
+INST(thumb32_SHADD8, "SHADD8", "111110101000nnnn1111dddd0010mmmm")
+INST(thumb32_SHSUB8, "SHSUB8", "111110101100nnnn1111dddd0010mmmm")
+
+// Parallel Addition and Subtraction (unsigned)
+INST(thumb32_UADD16, "UADD16", "111110101001nnnn1111dddd0100mmmm")
+INST(thumb32_UASX, "UASX", "111110101010nnnn1111dddd0100mmmm")
+INST(thumb32_USAX, "USAX", "111110101110nnnn1111dddd0100mmmm")
+INST(thumb32_USUB16, "USUB16", "111110101101nnnn1111dddd0100mmmm")
+INST(thumb32_UADD8, "UADD8", "111110101000nnnn1111dddd0100mmmm")
+INST(thumb32_USUB8, "USUB8", "111110101100nnnn1111dddd0100mmmm")
+INST(thumb32_UQADD16, "UQADD16", "111110101001nnnn1111dddd0101mmmm")
+INST(thumb32_UQASX, "UQASX", "111110101010nnnn1111dddd0101mmmm")
+INST(thumb32_UQSAX, "UQSAX", "111110101110nnnn1111dddd0101mmmm")
+INST(thumb32_UQSUB16, "UQSUB16", "111110101101nnnn1111dddd0101mmmm")
+INST(thumb32_UQADD8, "UQADD8", "111110101000nnnn1111dddd0101mmmm")
+INST(thumb32_UQSUB8, "UQSUB8", "111110101100nnnn1111dddd0101mmmm")
+INST(thumb32_UHADD16, "UHADD16", "111110101001nnnn1111dddd0110mmmm")
+INST(thumb32_UHASX, "UHASX", "111110101010nnnn1111dddd0110mmmm")
+INST(thumb32_UHSAX, "UHSAX", "111110101110nnnn1111dddd0110mmmm")
+INST(thumb32_UHSUB16, "UHSUB16", "111110101101nnnn1111dddd0110mmmm")
+INST(thumb32_UHADD8, "UHADD8", "111110101000nnnn1111dddd0110mmmm")
+INST(thumb32_UHSUB8, "UHSUB8", "111110101100nnnn1111dddd0110mmmm")
+
+// Miscellaneous Operations
+INST(thumb32_QADD, "QADD", "111110101000nnnn1111dddd1000mmmm")
+INST(thumb32_QDADD, "QDADD", "111110101000nnnn1111dddd1001mmmm")
+INST(thumb32_QSUB, "QSUB", "111110101000nnnn1111dddd1010mmmm")
+INST(thumb32_QDSUB, "QDSUB", "111110101000nnnn1111dddd1011mmmm")
+INST(thumb32_REV, "REV", "111110101001nnnn1111dddd1000mmmm")
+INST(thumb32_REV16, "REV16", "111110101001nnnn1111dddd1001mmmm")
+INST(thumb32_RBIT, "RBIT", "111110101001nnnn1111dddd1010mmmm")
+INST(thumb32_REVSH, "REVSH", "111110101001nnnn1111dddd1011mmmm")
+INST(thumb32_SEL, "SEL", "111110101010nnnn1111dddd1000mmmm")
+INST(thumb32_CLZ, "CLZ", "111110101011nnnn1111dddd1000mmmm")
+
+// Multiply, Multiply Accumulate, and Absolute Difference
+INST(thumb32_MUL, "MUL", "111110110000nnnn1111dddd0000mmmm")
+INST(thumb32_MLA, "MLA", "111110110000nnnnaaaadddd0000mmmm")
+INST(thumb32_MLS, "MLS", "111110110000nnnnaaaadddd0001mmmm")
+INST(thumb32_SMULXY, "SMULXY", "111110110001nnnn1111dddd00NMmmmm")
+INST(thumb32_SMLAXY, "SMLAXY", "111110110001nnnnaaaadddd00NMmmmm")
+INST(thumb32_SMUAD, "SMUAD", "111110110010nnnn1111dddd000Mmmmm")
+INST(thumb32_SMLAD, "SMLAD", "111110110010nnnnaaaadddd000Xmmmm")
+INST(thumb32_SMULWY, "SMULWY", "111110110011nnnn1111dddd000Mmmmm")
+INST(thumb32_SMLAWY, "SMLAWY", "111110110011nnnnaaaadddd000Mmmmm")
+INST(thumb32_SMUSD, "SMUSD", "111110110100nnnn1111dddd000Mmmmm")
+INST(thumb32_SMLSD, "SMLSD", "111110110100nnnnaaaadddd000Xmmmm")
+INST(thumb32_SMMUL, "SMMUL", "111110110101nnnn1111dddd000Rmmmm")
+INST(thumb32_SMMLA, "SMMLA", "111110110101nnnnaaaadddd000Rmmmm")
+INST(thumb32_SMMLS, "SMMLS", "111110110110nnnnaaaadddd000Rmmmm")
+INST(thumb32_USAD8, "USAD8", "111110110111nnnn1111dddd0000mmmm")
+INST(thumb32_USADA8, "USADA8", "111110110111nnnnaaaadddd0000mmmm")
+
+// Long Multiply, Long Multiply Accumulate, and Divide
+INST(thumb32_SMULL, "SMULL", "111110111000nnnnllllhhhh0000mmmm")
+INST(thumb32_SDIV, "SDIV", "111110111001nnnn1111dddd1111mmmm")
+INST(thumb32_UMULL, "UMULL", "111110111010nnnnllllhhhh0000mmmm")
+INST(thumb32_UDIV, "UDIV", "111110111011nnnn1111dddd1111mmmm")
+INST(thumb32_SMLAL, "SMLAL", "111110111100nnnnllllhhhh0000mmmm")
+INST(thumb32_SMLALXY, "SMLALXY", "111110111100nnnnllllhhhh10NMmmmm")
+INST(thumb32_SMLALD, "SMLALD", "111110111100nnnnllllhhhh110Mmmmm")
+INST(thumb32_SMLSLD, "SMLSLD", "111110111101nnnnllllhhhh110Mmmmm")
+INST(thumb32_UMLAL, "UMLAL", "111110111110nnnnllllhhhh0000mmmm")
+INST(thumb32_UMAAL, "UMAAL", "111110111110nnnnllllhhhh0110mmmm")
+
+// Coprocessor
+INST(thumb32_MCRR, "MCRR", "111o11000100uuuuttttppppooooMMMM")
+INST(thumb32_MRRC, "MRRC", "111o11000101uuuuttttppppooooMMMM")
+INST(thumb32_STC, "STC", "111o110pudw0nnnnDDDDppppvvvvvvvv")
+INST(thumb32_LDC, "LDC", "111o110pudw1nnnnDDDDppppvvvvvvvv")
+INST(thumb32_CDP, "CDP", "111o1110ooooNNNNDDDDppppooo0MMMM")
+INST(thumb32_MCR, "MCR", "111o1110ooo0NNNNttttppppooo1MMMM")
+INST(thumb32_MRC, "MRC", "111o1110ooo1NNNNttttppppooo1MMMM")
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h
new file mode 100644
index 0000000000..f79a859bf7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h
@@ -0,0 +1,58 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A32 {
+
+template<typename Visitor>
+using VFPMatcher = Decoder::Matcher<Visitor, u32>;
+
+template<typename V>
+std::optional<std::reference_wrapper<const VFPMatcher<V>>> DecodeVFP(u32 instruction) {
+ using Table = std::vector<VFPMatcher<V>>;
+
+ static const struct Tables {
+ Table unconditional;
+ Table conditional;
+ } tables = [] {
+ Table list = {
+
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(VFPMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)),
+#include "./vfp.inc"
+#undef INST
+
+ };
+
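+ // Matchers whose mask fully fixes the top nibble can only match the 1111---- unconditional
+ // VFP encodings; everything else carries a cccc condition field and goes in the
+ // conditional table.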
+ const auto division = std::stable_partition(list.begin(), list.end(), [&](const auto& matcher) {
+ return (matcher.GetMask() & 0xF0000000) == 0xF0000000;
+ });
+
+ return Tables{
+ Table{list.begin(), division},
+ Table{division, list.end()},
+ };
+ }();
+
+ const bool is_unconditional = (instruction & 0xF0000000) == 0xF0000000;
+ const Table& table = is_unconditional ? tables.unconditional : tables.conditional;
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ auto iter = std::find_if(table.begin(), table.end(), matches_instruction);
+ return iter != table.end() ? std::optional<std::reference_wrapper<const VFPMatcher<V>>>(*iter) : std::nullopt;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc
new file mode 100644
index 0000000000..21c9b78d79
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc
@@ -0,0 +1,71 @@
+// Floating-point three-register data processing instructions
+INST(vfp_VMLA, "VMLA", "cccc11100D00nnnndddd101zN0M0mmmm") // VFPv2
+INST(vfp_VMLS, "VMLS", "cccc11100D00nnnndddd101zN1M0mmmm") // VFPv2
+INST(vfp_VNMLS, "VNMLS", "cccc11100D01nnnndddd101zN0M0mmmm") // VFPv2
+INST(vfp_VNMLA, "VNMLA", "cccc11100D01nnnndddd101zN1M0mmmm") // VFPv2
+INST(vfp_VMUL, "VMUL", "cccc11100D10nnnndddd101zN0M0mmmm") // VFPv2
+INST(vfp_VNMUL, "VNMUL", "cccc11100D10nnnndddd101zN1M0mmmm") // VFPv2
+INST(vfp_VADD, "VADD", "cccc11100D11nnnndddd101zN0M0mmmm") // VFPv2
+INST(vfp_VSUB, "VSUB", "cccc11100D11nnnndddd101zN1M0mmmm") // VFPv2
+INST(vfp_VDIV, "VDIV", "cccc11101D00nnnndddd101zN0M0mmmm") // VFPv2
+INST(vfp_VFNMS, "VFNMS", "cccc11101D01nnnndddd101zN0M0mmmm") // VFPv4
+INST(vfp_VFNMA, "VFNMA", "cccc11101D01nnnndddd101zN1M0mmmm") // VFPv4
+INST(vfp_VFMA, "VFMA", "cccc11101D10nnnndddd101zN0M0mmmm") // VFPv4
+INST(vfp_VFMS, "VFMS", "cccc11101D10nnnndddd101zN1M0mmmm") // VFPv4
+INST(vfp_VSEL, "VSEL", "111111100Dccnnnndddd101zN0M0mmmm") // VFPv5
+INST(vfp_VMAXNM, "VMAXNM", "111111101D00nnnndddd101zN0M0mmmm") // VFPv5
+INST(vfp_VMINNM, "VMINNM", "111111101D00nnnndddd101zN1M0mmmm") // VFPv5
+
+// Other floating-point data-processing instructions
+INST(vfp_VMOV_imm, "VMOV (immediate)", "cccc11101D11vvvvdddd101z0000vvvv") // VFPv3
+INST(vfp_VMOV_reg, "VMOV (reg)", "cccc11101D110000dddd101z01M0mmmm") // VFPv2
+INST(vfp_VABS, "VABS", "cccc11101D110000dddd101z11M0mmmm") // VFPv2
+INST(vfp_VNEG, "VNEG", "cccc11101D110001dddd101z01M0mmmm") // VFPv2
+INST(vfp_VSQRT, "VSQRT", "cccc11101D110001dddd101z11M0mmmm") // VFPv2
+INST(vfp_VCVTB, "VCVTB", "cccc11101D11001odddd101z01M0mmmm") // VFPv3HP
+INST(vfp_VCVTT, "VCVTT", "cccc11101D11001odddd101z11M0mmmm") // VFPv3HP
+INST(vfp_VCMP, "VCMP", "cccc11101D110100dddd101zE1M0mmmm") // VFPv2
+INST(vfp_VCMP_zero, "VCMP (with zero)", "cccc11101D110101dddd101zE1000000") // VFPv2
+INST(vfp_VRINTR, "VRINTR", "cccc11101D110110dddd101z01M0mmmm") // VFPv5
+INST(vfp_VRINTZ, "VRINTZ", "cccc11101D110110dddd101z11M0mmmm") // VFPv5
+INST(vfp_VRINTX, "VRINTX", "cccc11101D110111dddd101z01M0mmmm") // VFPv5
+INST(vfp_VCVT_f_to_f, "VCVT (f32<->f64)", "cccc11101D110111dddd101z11M0mmmm") // VFPv2
+INST(vfp_VCVT_from_int, "VCVT (from int)", "cccc11101D111000dddd101zs1M0mmmm") // VFPv2
+INST(vfp_VCVT_from_fixed, "VCVT (from fixed)", "cccc11101D11101Udddd101zx1i0vvvv") // VFPv3
+INST(vfp_VCVT_to_u32, "VCVT (to u32)", "cccc11101D111100dddd101zr1M0mmmm") // VFPv2
+INST(vfp_VCVT_to_s32, "VCVT (to s32)", "cccc11101D111101dddd101zr1M0mmmm") // VFPv2
+INST(vfp_VCVT_to_fixed, "VCVT (to fixed)", "cccc11101D11111Udddd101zx1i0vvvv") // VFPv3
+INST(vfp_VRINT_rm, "VRINT{A,N,P,M}", "111111101D1110mmdddd101z01M0mmmm") // VFPv5
+INST(vfp_VCVT_rm, "VCVT{A,N,P,M}", "111111101D1111mmdddd101zU1M0mmmm") // VFPv5
+
+// Floating-point move instructions
+INST(vfp_VMOV_u32_f64, "VMOV (core to f64)", "cccc11100000ddddtttt1011D0010000") // VFPv2
+INST(vfp_VMOV_f64_u32, "VMOV (f64 to core)", "cccc11100001nnnntttt1011N0010000") // VFPv2
+INST(vfp_VMOV_u32_f32, "VMOV (core to f32)", "cccc11100000nnnntttt1010N0010000") // VFPv2
+INST(vfp_VMOV_f32_u32, "VMOV (f32 to core)", "cccc11100001nnnntttt1010N0010000") // VFPv2
+INST(vfp_VMOV_2u32_2f32, "VMOV (2xcore to 2xf32)", "cccc11000100uuuutttt101000M1mmmm") // VFPv2
+INST(vfp_VMOV_2f32_2u32, "VMOV (2xf32 to 2xcore)", "cccc11000101uuuutttt101000M1mmmm") // VFPv2
+INST(vfp_VMOV_2u32_f64, "VMOV (2xcore to f64)", "cccc11000100uuuutttt101100M1mmmm") // VFPv2
+INST(vfp_VMOV_f64_2u32, "VMOV (f64 to 2xcore)", "cccc11000101uuuutttt101100M1mmmm") // VFPv2
+INST(vfp_VMOV_from_i32, "VMOV (core to i32)", "cccc111000i0nnnntttt1011N0010000") // VFPv4
+INST(vfp_VMOV_from_i16, "VMOV (core to i16)", "cccc111000i0nnnntttt1011Ni110000") // ASIMD
+INST(vfp_VMOV_from_i8, "VMOV (core to i8)", "cccc111001i0nnnntttt1011Nii10000") // ASIMD
+INST(vfp_VMOV_to_i32, "VMOV (i32 to core)", "cccc111000i1nnnntttt1011N0010000") // VFPv4
+INST(vfp_VMOV_to_i16, "VMOV (i16 to core)", "cccc1110U0i1nnnntttt1011Ni110000") // ASIMD
+INST(vfp_VMOV_to_i8, "VMOV (i8 to core)", "cccc1110U1i1nnnntttt1011Nii10000") // ASIMD
+INST(vfp_VDUP, "VDUP (from core)", "cccc11101BQ0ddddtttt1011D0E10000") // ASIMD
+
+// Floating-point system register access
+INST(vfp_VMSR, "VMSR", "cccc111011100001tttt101000010000") // VFPv2
+INST(vfp_VMRS, "VMRS", "cccc111011110001tttt101000010000") // VFPv2
+
+// Extension register load-store instructions
+INST(vfp_VPUSH, "VPUSH", "cccc11010D101101dddd101zvvvvvvvv") // VFPv2
+INST(vfp_VPOP, "VPOP", "cccc11001D111101dddd101zvvvvvvvv") // VFPv2
+INST(vfp_VLDR, "VLDR", "cccc1101UD01nnnndddd101zvvvvvvvv") // VFPv2
+INST(vfp_VSTR, "VSTR", "cccc1101UD00nnnndddd101zvvvvvvvv") // VFPv2
+INST(arm_UDF, "Undefined VSTM/VLDM", "----11000-0---------101---------") // VFPv2
+INST(vfp_VSTM_a1, "VSTM (A1)", "cccc110puDw0nnnndddd1011vvvvvvvv") // VFPv2
+INST(vfp_VSTM_a2, "VSTM (A2)", "cccc110puDw0nnnndddd1010vvvvvvvv") // VFPv2
+INST(vfp_VLDM_a1, "VLDM (A1)", "cccc110puDw1nnnndddd1011vvvvvvvv") // VFPv2
+INST(vfp_VLDM_a2, "VLDM (A2)", "cccc110puDw1nnnndddd1010vvvvvvvv") // VFPv2
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h
new file mode 100644
index 0000000000..6a61afdefa
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h
@@ -0,0 +1,17 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::A32 {
+
+std::string DisassembleArm(u32 instruction);
+std::string DisassembleThumb16(u16 instruction);
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp
new file mode 100644
index 0000000000..ff0c48a1ce
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp
@@ -0,0 +1,1584 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstdlib>
+#include <string>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bit/rotate.hpp>
+
+#include "dynarmic/common/string_util.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/frontend/A32/decoder/arm.h"
+#include "dynarmic/frontend/A32/decoder/vfp.h"
+#include "dynarmic/frontend/A32/disassembler/disassembler.h"
+#include "dynarmic/frontend/imm.h"
+
+namespace Dynarmic::A32 {
+
+class DisassemblerVisitor {
+public:
+ using instruction_return_type = std::string;
+
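+ // A32 modified immediate: an 8-bit value rotated right by twice the 4-bit rotate field.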
+ static u32 ArmExpandImm(int rotate, Imm<8> imm8) {
+ return mcl::bit::rotate_right(static_cast<u32>(imm8.ZeroExtend()), rotate * 2);
+ }
+
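+ // For LSR, ASR and ROR an immediate shift amount of 0 encodes LSR #32, ASR #32 and RRX
+ // respectively (LSL #0 is simply no shift), as reflected below.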
+ static std::string ShiftStr(ShiftType shift, Imm<5> imm5) {
+ switch (shift) {
+ case ShiftType::LSL:
+ if (imm5 == 0)
+ return "";
+ return fmt::format(", lsl #{}", imm5.ZeroExtend());
+ case ShiftType::LSR:
+ if (imm5 == 0)
+ return ", lsr #32";
+ return fmt::format(", lsr #{}", imm5.ZeroExtend());
+ case ShiftType::ASR:
+ if (imm5 == 0)
+ return ", asr #32";
+ return fmt::format(", asr #{}", imm5.ZeroExtend());
+ case ShiftType::ROR:
+ if (imm5 == 0)
+ return ", rrx";
+ return fmt::format(", ror #{}", imm5.ZeroExtend());
+ }
+ ASSERT(false);
+ return "<internal error>";
+ }
+
+ static std::string RsrStr(Reg s, ShiftType shift, Reg m) {
+ switch (shift) {
+ case ShiftType::LSL:
+ return fmt::format("{}, lsl {}", m, s);
+ case ShiftType::LSR:
+ return fmt::format("{}, lsr {}", m, s);
+ case ShiftType::ASR:
+ return fmt::format("{}, asr {}", m, s);
+ case ShiftType::ROR:
+ return fmt::format("{}, ror {}", m, s);
+ }
+ ASSERT(false);
+ return "<internal error>";
+ }
+
+ static std::string RorStr(Reg m, SignExtendRotation rotate) {
+ switch (rotate) {
+ case SignExtendRotation::ROR_0:
+ return RegToString(m);
+ case SignExtendRotation::ROR_8:
+ return fmt::format("{}, ror #8", m);
+ case SignExtendRotation::ROR_16:
+ return fmt::format("{}, ror #16", m);
+ case SignExtendRotation::ROR_24:
+ return fmt::format("{}, ror #24", m);
+ }
+ ASSERT(false);
+ return "<internal error>";
+ }
+
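+    // DMB/DSB barrier option field; the default SY (0b1111) is printed with no suffix.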
+ static const char* BarrierOptionStr(Imm<4> option) {
+ switch (option.ZeroExtend()) {
+ case 0b0010:
+ return " oshst";
+ case 0b0011:
+ return " osh";
+ case 0b0110:
+ return " nshst";
+ case 0b0111:
+ return " nsh";
+ case 0b1010:
+ return " ishst";
+ case 0b1011:
+ return " ish";
+ case 0b1110:
+ return " st";
+ case 0b1111: // SY can be omitted.
+ return "";
+ default:
+ return " unknown";
+ }
+ }
+
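+    // VFP register numbering: for double-precision the extra bit is the high bit of the register
+    // number (D:Vd); for single-precision it is the low bit (Vd:D).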
+ static std::string FPRegStr(bool dp_operation, size_t base, bool bit) {
+ size_t reg_num;
+ if (dp_operation) {
+ reg_num = base + (bit ? 16 : 0);
+ } else {
+ reg_num = (base << 1) + (bit ? 1 : 0);
+ }
+ return fmt::format("{}{}", dp_operation ? 'd' : 's', reg_num);
+ }
+
+ static std::string FPNextRegStr(bool dp_operation, size_t base, bool bit) {
+ size_t reg_num;
+ if (dp_operation) {
+ reg_num = base + (bit ? 16 : 0);
+ } else {
+ reg_num = (base << 1) + (bit ? 1 : 0);
+ }
+ return fmt::format("{}{}", dp_operation ? 'd' : 's', reg_num + 1);
+ }
+
+ static std::string VectorStr(bool Q, size_t base, bool bit) {
+ size_t reg_num;
+ if (Q) {
+ reg_num = (base >> 1) + (bit ? 8 : 0);
+ } else {
+ reg_num = base + (bit ? 16 : 0);
+ }
+ return fmt::format("{}{}", Q ? 'q' : 'd', reg_num);
+ }
+
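+    // Coprocessor instructions with the NV condition are the unconditional "2" forms (e.g. mcr2, ldc2).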
+ static std::string CondOrTwo(Cond cond) {
+ return cond == Cond::NV ? "2" : CondToString(cond);
+ }
+
+ // Barrier instructions
+ std::string arm_DMB(Imm<4> option) {
+ return fmt::format("dmb{}", BarrierOptionStr(option));
+ }
+ std::string arm_DSB(Imm<4> option) {
+ return fmt::format("dsb{}", BarrierOptionStr(option));
+ }
+ std::string arm_ISB([[maybe_unused]] Imm<4> option) {
+ return "isb";
+ }
+
+ // Branch instructions
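+    // In ARM state the PC reads as the address of the current instruction plus 8, hence the +8 below.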
+ std::string arm_B(Cond cond, Imm<24> imm24) {
+ const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8);
+ return fmt::format("b{} {}#{}", CondToString(cond), Common::SignToChar(offset), abs(offset));
+ }
+ std::string arm_BL(Cond cond, Imm<24> imm24) {
+ const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8);
+ return fmt::format("bl{} {}#{}", CondToString(cond), Common::SignToChar(offset), abs(offset));
+ }
+ std::string arm_BLX_imm(bool H, Imm<24> imm24) {
+ const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8 + (H ? 2 : 0));
+ return fmt::format("blx {}#{}", Common::SignToChar(offset), abs(offset));
+ }
+ std::string arm_BLX_reg(Cond cond, Reg m) {
+ return fmt::format("blx{} {}", CondToString(cond), m);
+ }
+ std::string arm_BX(Cond cond, Reg m) {
+ return fmt::format("bx{} {}", CondToString(cond), m);
+ }
+ std::string arm_BXJ(Cond cond, Reg m) {
+ return fmt::format("bxj{} {}", CondToString(cond), m);
+ }
+
+ // Coprocessor instructions
+ std::string arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ return fmt::format("cdp{} p{}, #{}, {}, {}, {}, #{}", CondToString(cond), coproc_no, opc1, CRd, CRn, CRm, opc2);
+ }
+
+ std::string arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ const u32 imm32 = static_cast<u32>(imm8.ZeroExtend()) << 2;
+ if (!p && !u && !d && !w) {
+ return "<undefined>";
+ }
+ if (p) {
+ return fmt::format("ldc{}{} {}, {}, [{}, #{}{}]{}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n, u ? "+" : "-", imm32,
+ w ? "!" : "");
+ }
+ if (!p && w) {
+ return fmt::format("ldc{}{} {}, {}, [{}], #{}{}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n, u ? "+" : "-", imm32);
+ }
+ if (!p && !w && u) {
+ return fmt::format("ldc{}{} {}, {}, [{}], {}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n, imm8.ZeroExtend());
+ }
+ UNREACHABLE();
+ }
+
+ std::string arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ return fmt::format("mcr{} p{}, #{}, {}, {}, {}, #{}", CondOrTwo(cond), coproc_no, opc1, t, CRn, CRm, opc2);
+ }
+
+ std::string arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ return fmt::format("mcr{} p{}, #{}, {}, {}, {}", CondOrTwo(cond), coproc_no, opc, t, t2, CRm);
+ }
+
+ std::string arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ return fmt::format("mrc{} p{}, #{}, {}, {}, {}, #{}", CondOrTwo(cond), coproc_no, opc1, t, CRn, CRm, opc2);
+ }
+
+ std::string arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ return fmt::format("mrrc{} p{}, #{}, {}, {}, {}", CondOrTwo(cond), coproc_no, opc, t, t2, CRm);
+ }
+
+ std::string arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ const u32 imm32 = static_cast<u32>(imm8.ZeroExtend()) << 2;
+ if (!p && !u && !d && !w) {
+ return "<undefined>";
+ }
+ if (p) {
+ return fmt::format("stc{}{} {}, {}, [{}, #{}{}]{}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n,
+ u ? "+" : "-", imm32, w ? "!" : "");
+ }
+ if (!p && w) {
+ return fmt::format("stc{}{} {}, {}, [{}], #{}{}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n,
+ u ? "+" : "-", imm32);
+ }
+ if (!p && !w && u) {
+ return fmt::format("stc{}{} {}, {}, [{}], {}", d ? "l" : "",
+ CondOrTwo(cond), coproc_no, CRd, n, imm8.ZeroExtend());
+ }
+ UNREACHABLE();
+ }
+
+ // CRC32 instructions
+ std::string arm_CRC32([[maybe_unused]] Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) {
+ static constexpr std::array data_type{
+ "b",
+ "h",
+ "w",
+ "invalid",
+ };
+
+ return fmt::format("crc32{} {}, {}, {}", data_type[sz.ZeroExtend()], d, n, m);
+ }
+ std::string arm_CRC32C([[maybe_unused]] Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) {
+ static constexpr std::array data_type{
+ "b",
+ "h",
+ "w",
+ "invalid",
+ };
+
+ return fmt::format("crc32c{} {}, {}, {}", data_type[sz.ZeroExtend()], d, n, m);
+ }
+
+ // Data processing instructions
+ std::string arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("adc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("adc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("adc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("add{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("add{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("add{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("and{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("and{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("and{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("bic{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("bic{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("bic{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ return fmt::format("cmn{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("cmn{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("cmn{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m));
+ }
+ std::string arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ return fmt::format("cmp{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("cmp{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_CMP_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("cmp{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m));
+ }
+ std::string arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("eor{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("eor{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("eor{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("mov{}{} {}, #{}", CondToString(cond), S ? "s" : "", d, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("mov{}{} {}, {}{}", CondToString(cond), S ? "s" : "", d, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("mov{}{} {}, {}", CondToString(cond), S ? "s" : "", d, RsrStr(s, shift, m));
+ }
+ std::string arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("mvn{}{} {}, #{}", CondToString(cond), S ? "s" : "", d, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("mvn{}{} {}, {}{}", CondToString(cond), S ? "s" : "", d, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("mvn{}{} {}, {}", CondToString(cond), S ? "s" : "", d, RsrStr(s, shift, m));
+ }
+ std::string arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("orr{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("orr{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("orr{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("rsb{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("rsb{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("rsb{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("rsc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("rsc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("rsc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("sbc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("sbc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("sbc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ return fmt::format("sub{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("sub{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("sub{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m));
+ }
+ std::string arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ return fmt::format("teq{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("teq{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("teq{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m));
+ }
+ std::string arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ return fmt::format("tst{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ return fmt::format("tst{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ return fmt::format("tst{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m));
+ }
+
+ // Exception generation instructions
+ std::string arm_BKPT(Cond cond, Imm<12> imm12, Imm<4> imm4) {
+ return fmt::format("bkpt{} #{}", CondToString(cond), concatenate(imm12, imm4).ZeroExtend());
+ }
+ std::string arm_SVC(Cond cond, Imm<24> imm24) {
+ return fmt::format("svc{} #{}", CondToString(cond), imm24.ZeroExtend());
+ }
+ std::string arm_UDF() {
+ return fmt::format("udf");
+ }
+
+ // Extension functions
+ std::string arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxtab{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxtab16{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxtah{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxtb{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+ std::string arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxtb16{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+ std::string arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("sxth{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+ std::string arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxtab{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxtab16{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxtah{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate));
+ }
+ std::string arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxtb{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+ std::string arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxtb16{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+ std::string arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ return fmt::format("uxth{} {}, {}", CondToString(cond), d, RorStr(m, rotate));
+ }
+
+ // Hint instructions
+ std::string arm_PLD_imm(bool add, bool R, Reg n, Imm<12> imm12) {
+ const char sign = add ? '+' : '-';
+ const char* const w = R ? "" : "w";
+
+ return fmt::format("pld{} [{}, #{}{:x}]", w, n, sign, imm12.ZeroExtend());
+ }
+ std::string arm_PLD_reg(bool add, bool R, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ const char sign = add ? '+' : '-';
+ const char* const w = R ? "" : "w";
+
+ return fmt::format("pld{} [{}, {}{}{}]", w, n, sign, m, ShiftStr(shift, imm5));
+ }
+ std::string arm_SEV() {
+ return "sev";
+ }
+ std::string arm_SEVL() {
+ return "sevl";
+ }
+ std::string arm_WFE() {
+ return "wfe";
+ }
+ std::string arm_WFI() {
+ return "wfi";
+ }
+ std::string arm_YIELD() {
+ return "yield";
+ }
+
+ // Load/Store instructions
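+    // For the forms below, P == 0 with W == 1 would select the unprivileged LDRT/STRT-family
+    // encodings, which are decoded separately; W == 1 reaching here is therefore flagged as an error.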
+ std::string arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12) {
+ const bool P = true;
+ const bool W = false;
+ return arm_LDR_imm(cond, P, U, W, Reg::PC, t, imm12);
+ }
+ std::string arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ const u32 imm12_value = imm12.ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldr{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign,
+ imm12_value, W ? "!" : "");
+ } else {
+ return fmt::format("ldr{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign,
+ imm12_value, W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldr{} {}, [{}, {}{}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? "!" : "");
+ } else {
+ return fmt::format("ldr{} {}, [{}], {}{}{}{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12) {
+ const bool P = true;
+ const bool W = false;
+ return arm_LDRB_imm(cond, P, U, W, Reg::PC, t, imm12);
+ }
+ std::string arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ const u32 imm12_value = imm12.ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrb{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrb{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrb{} {}, [{}, {}{}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? "!" : "");
+ } else {
+ return fmt::format("ldrb{} {}, [{}], {}{}{}{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : "");
+ }
+ }
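+    // "ice" serves as a placeholder result for forms this disassembler does not pretty-print.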
+ std::string arm_LDRBT() { return "ice"; }
+ std::string arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const bool P = true;
+ const bool W = false;
+ return arm_LDRD_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b);
+ }
+ std::string arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrd{} {}, {}, [{}, #{}{}]{}",
+ CondToString(cond), t, t + 1, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrd{} {}, {}, [{}], #{}{}{}",
+ CondToString(cond), t, t + 1, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrd{} {}, {}, [{}, {}{}]{}",
+ CondToString(cond), t, t + 1, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrd{} {}, {}, [{}], {}{}{}",
+ CondToString(cond), t, t + 1, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ return arm_LDRH_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b);
+ }
+ std::string arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrh{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrh{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrh{} {}, [{}, {}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrh{} {}, [{}], {}{}{}",
+ CondToString(cond), t, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRHT() { return "ice"; }
+ std::string arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const bool P = true;
+ const bool W = false;
+ return arm_LDRSB_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b);
+ }
+ std::string arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrsb{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrsb{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrsb{} {}, [{}, {}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrsb{} {}, [{}], {}{}{}",
+ CondToString(cond), t, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRSBT() { return "ice"; }
+ std::string arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const bool P = true;
+ const bool W = false;
+ return arm_LDRSH_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b);
+ }
+ std::string arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrsh{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrsh{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("ldrsh{} {}, [{}, {}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("ldrsh{} {}, [{}], {}{}{}",
+ CondToString(cond), t, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_LDRSHT() { return "ice"; }
+ std::string arm_LDRT() { return "ice"; }
+ std::string arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ const u32 imm12_value = imm12.ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("str{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? "!" : "");
+ } else {
+ return fmt::format("str{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("str{} {}, [{}, {}{}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? "!" : "");
+ } else {
+ return fmt::format("str{} {}, [{}], {}{}{}{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ const u32 imm12_value = imm12.ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strb{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? "!" : "");
+ } else {
+ return fmt::format("strb{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm12_value,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strb{} {}, [{}, {}{}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? "!" : "");
+ } else {
+ return fmt::format("strb{} {}, [{}], {}{}{}{}",
+ CondToString(cond), t, n, sign, m,
+ ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRBT() { return "ice"; }
+ std::string arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strd{} {}, {}, [{}, #{}{}]{}",
+ CondToString(cond), t, t + 1, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("strd{} {}, {}, [{}], #{}{}{}",
+ CondToString(cond), t, t + 1, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strd{} {}, {}, [{}, {}{}]{}",
+ CondToString(cond), t, t + 1, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("strd{} {}, {}, [{}], {}{}{}",
+ CondToString(cond), t, t + 1, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strh{} {}, [{}, #{}{}]{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? "!" : "");
+ } else {
+ return fmt::format("strh{} {}, [{}], #{}{}{}",
+ CondToString(cond), t, n, sign, imm32,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ const char sign = U ? '+' : '-';
+
+ if (P) {
+ return fmt::format("strd{} {}, [{}, {}{}]{}",
+ CondToString(cond), t, n, sign, m,
+ W ? "!" : "");
+ } else {
+ return fmt::format("strd{} {}, [{}], {}{}{}",
+ CondToString(cond), t, n, sign, m,
+ W ? " (err: W == 1!!!)" : "");
+ }
+ }
+ std::string arm_STRHT() { return "ice"; }
+ std::string arm_STRT() { return "ice"; }
+
+ // Load/Store multiple instructions
+ std::string arm_LDM(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("ldm{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_LDMDA(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("ldmda{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_LDMDB(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("ldmdb{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_LDMIB(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("ldmib{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_LDM_usr() { return "ice"; }
+ std::string arm_LDM_eret() { return "ice"; }
+ std::string arm_STM(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("stm{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_STMDA(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("stmda{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_STMDB(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("stmdb{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_STMIB(Cond cond, bool W, Reg n, RegList list) {
+ return fmt::format("stmib{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list));
+ }
+ std::string arm_STM_usr() { return "ice"; }
+
+ // Miscellaneous instructions
+ std::string arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb) {
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 width = msb.ZeroExtend() - lsb_value + 1;
+ return fmt::format("bfc{} {}, #{}, #{}",
+ CondToString(cond), d, lsb_value, width);
+ }
+ std::string arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n) {
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 width = msb.ZeroExtend() - lsb_value + 1;
+ return fmt::format("bfi{} {}, {}, #{}, #{}",
+ CondToString(cond), d, n, lsb_value, width);
+ }
+ std::string arm_CLZ(Cond cond, Reg d, Reg m) {
+ return fmt::format("clz{} {}, {}", CondToString(cond), d, m);
+ }
+ std::string arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) {
+ const u32 imm = concatenate(imm4, imm12).ZeroExtend();
+ return fmt::format("movt{} {}, #{}", CondToString(cond), d, imm);
+ }
+ std::string arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) {
+ const u32 imm = concatenate(imm4, imm12).ZeroExtend();
+ return fmt::format("movw{}, {}, #{}", CondToString(cond), d, imm);
+ }
+ std::string arm_NOP() {
+ return "nop";
+ }
+ std::string arm_RBIT(Cond cond, Reg d, Reg m) {
+ return fmt::format("rbit{} {}, {}", CondToString(cond), d, m);
+ }
+ std::string arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) {
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 width = widthm1.ZeroExtend() + 1;
+ return fmt::format("sbfx{} {}, {}, #{}, #{}",
+ CondToString(cond), d, n, lsb_value, width);
+ }
+ std::string arm_SEL(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("sel{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) {
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 width = widthm1.ZeroExtend() + 1;
+ return fmt::format("ubfx{} {}, {}, #{}, #{}",
+ CondToString(cond), d, n, lsb_value, width);
+ }
+
+ // Unsigned sum of absolute difference functions
+ std::string arm_USAD8(Cond cond, Reg d, Reg m, Reg n) {
+ return fmt::format("usad8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n) {
+ return fmt::format("usad8a{} {}, {}, {}, {}", CondToString(cond), d, n, m, a);
+ }
+
+ // Packing instructions
+ std::string arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) {
+ return fmt::format("pkhbt{} {}, {}, {}{}", CondToString(cond), d, n, m, ShiftStr(ShiftType::LSL, imm5));
+ }
+ std::string arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) {
+ return fmt::format("pkhtb{} {}, {}, {}{}", CondToString(cond), d, n, m, ShiftStr(ShiftType::ASR, imm5));
+ }
+
+ // Reversal instructions
+ std::string arm_REV(Cond cond, Reg d, Reg m) {
+ return fmt::format("rev{} {}, {}", CondToString(cond), d, m);
+ }
+ std::string arm_REV16(Cond cond, Reg d, Reg m) {
+ return fmt::format("rev16{} {}, {}", CondToString(cond), d, m);
+ }
+ std::string arm_REVSH(Cond cond, Reg d, Reg m) {
+ return fmt::format("revsh{} {}, {}", CondToString(cond), d, m);
+ }
+
+ // Saturation instructions
+ std::string arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) {
+ const u32 bit_position = sat_imm.ZeroExtend() + 1;
+ return fmt::format("ssat{} {}, #{}, {}{}",
+ CondToString(cond), d, bit_position, n,
+ ShiftStr(ShiftType(sh << 1), imm5));
+ }
+ std::string arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) {
+ const u32 bit_position = sat_imm.ZeroExtend() + 1;
+ return fmt::format("ssat16{} {}, #{}, {}",
+ CondToString(cond), d, bit_position, n);
+ }
+ std::string arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) {
+ return fmt::format("usat{} {}, #{}, {}{}",
+ CondToString(cond), d, sat_imm.ZeroExtend(), n,
+ ShiftStr(ShiftType(sh << 1), imm5));
+ }
+ std::string arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) {
+ return fmt::format("usat16{} {}, #{}, {}",
+ CondToString(cond), d, sat_imm.ZeroExtend(), n);
+ }
+
+ // Divide instructions
+ std::string arm_SDIV(Cond cond, Reg d, Reg m, Reg n) {
+ return fmt::format("sdiv{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UDIV(Cond cond, Reg d, Reg m, Reg n) {
+ return fmt::format("udiv{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+
+ // Multiply (Normal) instructions
+ std::string arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n) {
+ return fmt::format("mla{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n) {
+ return fmt::format("mls{} {}, {}, {}, {}", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n) {
+ return fmt::format("mul{}{} {}, {}, {}", S ? "s" : "", CondToString(cond), d, n, m);
+ }
+
+ // Multiply (Long) instructions
+ std::string arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ return fmt::format("smlal{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ return fmt::format("smull{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n) {
+ return fmt::format("umaal{} {}, {}, {}, {}", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ return fmt::format("umlal{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ return fmt::format("umull{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+
+ // Multiply (Halfword) instructions
+ std::string arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n) {
+ return fmt::format("smlal{}{}{} {}, {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n) {
+ return fmt::format("smla{}{}{} {}, {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n) {
+ return fmt::format("smul{}{}{} {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), d, n, m);
+ }
+
+ // Multiply (word by halfword) instructions
+ std::string arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ return fmt::format("smlaw{}{} {}, {}, {}, {}", M ? 't' : 'b', CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ return fmt::format("smulw{}{} {}, {}, {}", M ? 't' : 'b', CondToString(cond), d, n, m);
+ }
+
+ // Multiply (Most significant word) instructions
+ std::string arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) {
+ return fmt::format("smmla{}{} {}, {}, {}, {}", R ? "r" : "", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) {
+ return fmt::format("smmls{}{} {}, {}, {}, {}", R ? "r" : "", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n) {
+ return fmt::format("smmul{}{} {}, {}, {}", R ? "r" : "", CondToString(cond), d, n, m);
+ }
+
+ // Multiply (Dual) instructions
+ std::string arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ return fmt::format("smlad{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) {
+ return fmt::format("smlald{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ return fmt::format("smlsd{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m, a);
+ }
+ std::string arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) {
+ return fmt::format("smlsld{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), dLo, dHi, n, m);
+ }
+ std::string arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ return fmt::format("smuad{}{} {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m);
+ }
+ std::string arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ return fmt::format("smusd{}{} {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m);
+ }
+
+ // Parallel Add/Subtract (Modulo arithmetic) instructions
+ std::string arm_SADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("sadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("sadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("sasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SSAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("ssax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("ssub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("ssub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_USAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("usax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_USUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("usub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_USUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("usub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+
+ // Parallel Add/Subtract (Saturating) instructions
+ std::string arm_QADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_QADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_QASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_QSAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qsax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_QSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qsub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_QSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qsub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQSAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqsax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqsub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uqsub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+
+ // Parallel Add/Subtract (Halving) instructions
+ std::string arm_SHADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SHASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shsax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shsub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("shsub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhadd8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhadd16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHASX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhasx{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhsax{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhsub8{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+ std::string arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("uhsub16{} {}, {}, {}", CondToString(cond), d, n, m);
+ }
+
+ // Saturated Add/Subtract instructions
+ std::string arm_QADD(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qadd{} {}, {}, {}", CondToString(cond), d, m, n);
+ }
+ std::string arm_QSUB(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qsub{} {}, {}, {}", CondToString(cond), d, m, n);
+ }
+ std::string arm_QDADD(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qdadd{} {}, {}, {}", CondToString(cond), d, m, n);
+ }
+ std::string arm_QDSUB(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("qdsub{} {}, {}, {}", CondToString(cond), d, m, n);
+ }
+
+ // Synchronization Primitive instructions
+ std::string arm_CLREX() {
+ return "clrex";
+ }
+ std::string arm_SWP(Cond cond, Reg n, Reg t, Reg t2) {
+ return fmt::format("swp{} {}, {}, [{}]", CondToString(cond), t, t2, n);
+ }
+ std::string arm_SWPB(Cond cond, Reg n, Reg t, Reg t2) {
+ return fmt::format("swpb{} {}, {}, [{}]", CondToString(cond), t, t2, n);
+ }
+ std::string arm_LDA(Cond cond, Reg n, Reg t) {
+ return fmt::format("lda{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_LDAB(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldab{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_LDAH(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldah{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_LDAEX(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldaex{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_LDAEXB(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldaexb{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_LDAEXD(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldaexd{} {}, {}, [{}]", CondToString(cond), t, t + 1, n);
+ }
+ std::string arm_LDAEXH(Cond cond, Reg n, Reg t) {
+ return fmt::format("ldaexh{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_STL(Cond cond, Reg n, Reg t) {
+ return fmt::format("stl{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_STLB(Cond cond, Reg n, Reg t) {
+ return fmt::format("stlb{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_STLH(Cond cond, Reg n, Reg t) {
+ return fmt::format("stlh{} {}, [{}]", CondToString(cond), t, n);
+ }
+ std::string arm_STLEX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("stlex{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+ std::string arm_STLEXB(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("stlexb{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+ std::string arm_STLEXD(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("stlexd{} {}, {}, {}, [{}]", CondToString(cond), d, m, m + 1, n);
+ }
+ std::string arm_STLEXH(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("stlexh{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+ std::string arm_LDREX(Cond cond, Reg n, Reg d) {
+ return fmt::format("ldrex{} {}, [{}]", CondToString(cond), d, n);
+ }
+ std::string arm_LDREXB(Cond cond, Reg n, Reg d) {
+ return fmt::format("ldrexb{} {}, [{}]", CondToString(cond), d, n);
+ }
+ std::string arm_LDREXD(Cond cond, Reg n, Reg d) {
+ return fmt::format("ldrexd{} {}, {}, [{}]", CondToString(cond), d, d + 1, n);
+ }
+ std::string arm_LDREXH(Cond cond, Reg n, Reg d) {
+ return fmt::format("ldrexh{} {}, [{}]", CondToString(cond), d, n);
+ }
+ std::string arm_STREX(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("strex{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+ std::string arm_STREXB(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("strexb{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+ std::string arm_STREXD(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("strexd{} {}, {}, {}, [{}]", CondToString(cond), d, m, m + 1, n);
+ }
+ std::string arm_STREXH(Cond cond, Reg n, Reg d, Reg m) {
+ return fmt::format("strexh{} {}, {}, [{}]", CondToString(cond), d, m, n);
+ }
+
+ // Status register access instructions
+ std::string arm_CPS() { return "ice"; }
+ std::string arm_MRS(Cond cond, Reg d) {
+ return fmt::format("mrs{} {}, apsr", CondToString(cond), d);
+ }
+ std::string arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8) {
+ const bool write_c = mcl::bit::get_bit<0>(mask);
+ const bool write_x = mcl::bit::get_bit<1>(mask);
+ const bool write_s = mcl::bit::get_bit<2>(mask);
+ const bool write_f = mcl::bit::get_bit<3>(mask);
+ return fmt::format("msr{} cpsr_{}{}{}{}, #{}",
+ CondToString(cond),
+ write_c ? "c" : "",
+ write_x ? "x" : "",
+ write_s ? "s" : "",
+ write_f ? "f" : "",
+ ArmExpandImm(rotate, imm8));
+ }
+ std::string arm_MSR_reg(Cond cond, unsigned mask, Reg n) {
+ const bool write_c = mcl::bit::get_bit<0>(mask);
+ const bool write_x = mcl::bit::get_bit<1>(mask);
+ const bool write_s = mcl::bit::get_bit<2>(mask);
+ const bool write_f = mcl::bit::get_bit<3>(mask);
+ return fmt::format("msr{} cpsr_{}{}{}{}, {}",
+ CondToString(cond),
+ write_c ? "c" : "",
+ write_x ? "x" : "",
+ write_s ? "s" : "",
+ write_f ? "f" : "",
+ n);
+ }
+ std::string arm_RFE() { return "ice"; }
+ std::string arm_SETEND(bool E) {
+ return E ? "setend be" : "setend le";
+ }
+ std::string arm_SRS() { return "ice"; }
+
+ // Floating point arithmetic instructions
+ std::string vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vadd{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vsub{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vmul{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vmla{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vmls{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vnmul{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vnmla{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vnmls{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vdiv{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vfnms{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vfnma{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vfms{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vfma{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
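+    // VSEL only encodes the top two condition bits; the full condition is cc:(cc<0> EOR cc<1>):'0',
+    // which yields EQ, VS, GE or GT.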
+ std::string vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ const Cond cond = concatenate(cc, Imm<1>{cc.Bit<0>() != cc.Bit<1>()}, Imm<1>{0}).ZeroExtend<Cond>();
+ return fmt::format("vsel{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vmaxnm.{} {}, {}, {}", sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ return fmt::format("vminnm.{} {}, {}, {}", sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M));
+ }
+
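+    // VFPExpandImm: sign = imm8<7>, exponent = NOT(imm8<6>):Replicate(imm8<6>):imm8<5:4>,
+    // fraction = imm8<3:0> padded with zeros.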
+ std::string vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L) {
+ const auto imm8 = concatenate(imm4H, imm4L);
+
+ if (sz) {
+ const u64 sign = static_cast<u64>(imm8.Bit<7>());
+ const u64 exp = (imm8.Bit<6>() ? 0x3FC : 0x400) | imm8.Bits<4, 5, u64>();
+ const u64 fract = imm8.Bits<0, 3, u64>() << 48;
+ const u64 immediate = (sign << 63) | (exp << 52) | fract;
+ return fmt::format("vmov{}.f64 {}, #0x{:016x}", CondToString(cond), FPRegStr(sz, Vd, D), immediate);
+ } else {
+ const u32 sign = static_cast<u32>(imm8.Bit<7>());
+ const u32 exp = (imm8.Bit<6>() ? 0x7C : 0x80) | imm8.Bits<4, 5>();
+ const u32 fract = imm8.Bits<0, 3>() << 19;
+ const u32 immediate = (sign << 31) | (exp << 23) | fract;
+ return fmt::format("vmov{}.f32 {}, #0x{:08x}", CondToString(cond), FPRegStr(sz, Vd, D), immediate);
+ }
+ }
+
+ std::string vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D) {
+ return fmt::format("vmov{}.32 {}, {}", CondToString(cond), FPRegStr(true, Vd, D), t);
+ }
+
+ std::string vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N) {
+ return fmt::format("vmov{}.32 {}, {}", CondToString(cond), t, FPRegStr(true, Vn, N));
+ }
+
+ std::string vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N) {
+ return fmt::format("vmov{}.32 {}, {}", CondToString(cond), FPRegStr(false, Vn, N), t);
+ }
+
+ std::string vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N) {
+ return fmt::format("vmov{}.32 {}, {}", CondToString(cond), t, FPRegStr(false, Vn, N));
+ }
+
+ std::string vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ return fmt::format("vmov{} {}, {}, {}, {}", CondToString(cond), FPRegStr(false, Vm, M), FPNextRegStr(false, Vm, M), t, t2);
+ }
+
+ std::string vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ return fmt::format("vmov{} {}, {}, {}, {}", CondToString(cond), t, t2, FPRegStr(false, Vm, M), FPNextRegStr(false, Vm, M));
+ }
+
+ std::string vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ return fmt::format("vmov{} {}, {}, {}", CondToString(cond), FPRegStr(true, Vm, M), t, t2);
+ }
+
+ std::string vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ return fmt::format("vmov{} {}, {}, {}", CondToString(cond), t, t2, FPRegStr(true, Vm, M));
+ }
+
+ std::string vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D) {
+ const size_t index = i.ZeroExtend();
+ return fmt::format("vmov{}.32 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
+ }
+
+ std::string vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2) {
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ return fmt::format("vmov{}.16 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
+ }
+
+ std::string vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2) {
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ return fmt::format("vmov{}.8 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
+ }
+
+ std::string vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N) {
+ const size_t index = i.ZeroExtend();
+ return fmt::format("vmov{}.32 {}, {}[{}]", CondToString(cond), t, FPRegStr(true, Vn, N), index);
+ }
+
+ std::string vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2) {
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ return fmt::format("vmov{}.{}16 {}, {}[{}]", CondToString(cond), U ? 'u' : 's', t, FPRegStr(true, Vn, N), index);
+ }
+
+ std::string vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2) {
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ return fmt::format("vmov{}.{}8 {}, {}[{}]", CondToString(cond), U ? 'u' : 's', t, FPRegStr(true, Vn, N), index);
+ }
+
+ std::string vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E) {
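+ // B:E selects the element size: 0b00 -> 32 bits, 0b01 -> 16 bits, 0b10 -> 8 bits.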
+ const size_t esize = 32u >> concatenate(B, E).ZeroExtend();
+ return fmt::format("vdup{}.{} {}, {}", CondToString(cond), esize, VectorStr(Q, Vd, D), t);
+ }
+
+ std::string vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vmov{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vabs{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vneg{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vsqrt{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ const bool convert_from_half = !op;
+ const char* const to = convert_from_half ? (sz ? "f64" : "f32") : "f16";
+ const char* const from = convert_from_half ? "f16" : (sz ? "f64" : "f32");
+ return fmt::format("vcvtb{}.{}.{} {}, {}", CondToString(cond), to, from, FPRegStr(convert_from_half ? sz : false, Vd, D), FPRegStr(convert_from_half ? false : sz, Vm, M));
+ }
+
+ std::string vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ const bool convert_from_half = !op;
+ const char* const to = convert_from_half ? (sz ? "f64" : "f32") : "f16";
+ const char* const from = convert_from_half ? "f16" : (sz ? "f64" : "f32");
+ return fmt::format("vcvtt{}.{}.{} {}, {}", CondToString(cond), to, from, FPRegStr(convert_from_half ? sz : false, Vd, D), FPRegStr(convert_from_half ? false : sz, Vm, M));
+ }
+
+ std::string vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm) {
+ return fmt::format("vcmp{}{}.{} {}, {}", E ? "e" : "", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E) {
+ return fmt::format("vcmp{}{}.{} {}, #0.0", E ? "e" : "", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D));
+ }
+
+ std::string vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintr{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintz{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintx{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vcvt{}.{}.{} {}, {}", CondToString(cond), !sz ? "f64" : "f32", sz ? "f64" : "f32", FPRegStr(!sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm) {
+ return fmt::format("vcvt{}.{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", is_signed ? "s32" : "u32", FPRegStr(sz, Vd, D), FPRegStr(false, Vm, M));
+ }
+
+ std::string vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) {
+ const size_t size = sx ? 32 : 16;
+ const size_t fbits = size - concatenate(imm4, i).ZeroExtend();
+ return fmt::format("vcvt{}.{}.{}{} {}, {}, #{}", CondToString(cond), sz ? "f64" : "f32", U ? 'u' : 's', size, FPRegStr(sz, Vd, D), FPRegStr(sz, Vd, D), fbits);
+ }
+
+ std::string vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) {
+ return fmt::format("vcvt{}{}.u32.{} {}, {}", round_towards_zero ? "" : "r", CondToString(cond), sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) {
+ return fmt::format("vcvt{}{}.s32.{} {}, {}", round_towards_zero ? "" : "r", CondToString(cond), sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) {
+ const size_t size = sx ? 32 : 16;
+ const size_t fbits = size - concatenate(imm4, i).ZeroExtend();
+ return fmt::format("vcvt{}.{}{}.{} {}, {}, #{}", CondToString(cond), U ? 'u' : 's', size, sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vd, D), fbits);
+ }
+
+ std::string vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrint{}.{} {}, {}", "anpm"[rm], sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm) {
+ return fmt::format("vcvt{}.{}.{} {}, {}", "anpm"[rm], U ? "u32" : "s32", sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VMSR(Cond cond, Reg t) {
+ return fmt::format("vmsr{} fpscr, {}", CondToString(cond), t);
+ }
+
+ std::string vfp_VMRS(Cond cond, Reg t) {
+ if (t == Reg::R15) {
+ return fmt::format("vmrs{} apsr_nzcv, fpscr", CondToString(cond));
+ } else {
+ return fmt::format("vmrs{} {}, fpscr", CondToString(cond), t);
+ }
+ }
+
+ std::string vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) {
+ return fmt::format("vpop{} {}(+{})",
+ CondToString(cond), FPRegStr(sz, Vd, D),
+ imm8.ZeroExtend() >> (sz ? 1 : 0));
+ }
+
+ std::string vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) {
+ return fmt::format("vpush{} {}(+{})",
+ CondToString(cond), FPRegStr(sz, Vd, D),
+ imm8.ZeroExtend() >> (sz ? 1 : 0));
+ }
+
+ std::string vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const char sign = U ? '+' : '-';
+ return fmt::format("vldr{} {}, [{}, #{}{}]",
+ CondToString(cond), FPRegStr(sz, Vd, D), n, sign, imm32);
+ }
+
+ std::string vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const char sign = U ? '+' : '-';
+ return fmt::format("vstr{} {}, [{}, #{}{}]",
+ CondToString(cond), FPRegStr(sz, Vd, D), n, sign, imm32);
+ }
+
+ std::string vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ const char* mode = "<invalid mode>";
+ if (!p && u) {
+ mode = "ia";
+ }
+ if (p && !u) {
+ mode = "db";
+ }
+ return fmt::format("vstm{}{}.f64 {}{}, {}(+{})", mode,
+ CondToString(cond), n, w ? "!" : "",
+ FPRegStr(true, Vd, D), imm8.ZeroExtend());
+ }
+
+ std::string vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ const char* mode = "<invalid mode>";
+ if (!p && u) {
+ mode = "ia";
+ }
+ if (p && !u) {
+ mode = "db";
+ }
+ return fmt::format("vstm{}{}.f32 {}{}, {}(+{})", mode,
+ CondToString(cond), n, w ? "!" : "",
+ FPRegStr(false, Vd, D), imm8.ZeroExtend());
+ }
+
+ std::string vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ const char* mode = "<invalid mode>";
+ if (!p && u) {
+ mode = "ia";
+ }
+ if (p && !u) {
+ mode = "db";
+ }
+ return fmt::format("vldm{}{}.f64 {}{}, {}(+{})", mode,
+ CondToString(cond), n, w ? "!" : "",
+ FPRegStr(true, Vd, D), imm8.ZeroExtend());
+ }
+
+ std::string vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ const char* mode = "<invalid mode>";
+ if (!p && u) {
+ mode = "ia";
+ }
+ if (p && !u) {
+ mode = "db";
+ }
+ return fmt::format("vldm{}{}.f32 {}{}, {}(+{})", mode,
+ CondToString(cond), n, w ? "!" : "",
+ FPRegStr(false, Vd, D), imm8.ZeroExtend());
+ }
+};
+
+std::string DisassembleArm(u32 instruction) {
+ DisassemblerVisitor visitor;
+ if (auto vfp_decoder = DecodeVFP<DisassemblerVisitor>(instruction)) {
+ return vfp_decoder->get().call(visitor, instruction);
+ } else if (auto decoder = DecodeArm<DisassemblerVisitor>(instruction)) {
+ return decoder->get().call(visitor, instruction);
+ } else {
+ return fmt::format("UNKNOWN: {:x}", instruction);
+ }
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp
new file mode 100644
index 0000000000..860181311b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp
@@ -0,0 +1,397 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstdlib>
+#include <string>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/common/string_util.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/frontend/A32/decoder/thumb16.h"
+#include "dynarmic/frontend/A32/disassembler/disassembler.h"
+#include "dynarmic/frontend/imm.h"
+
+namespace Dynarmic::A32 {
+
+class DisassemblerVisitor {
+public:
+ using instruction_return_type = std::string;
+
+ std::string thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d) {
+ return fmt::format("lsls {}, {}, #{}", d, m, imm5.ZeroExtend());
+ }
+
+ std::string thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d) {
+ const u32 shift = imm5 != 0 ? imm5.ZeroExtend() : 32U;
+ return fmt::format("lsrs {}, {}, #{}", d, m, shift);
+ }
+
+ std::string thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d) {
+ const u32 shift = imm5 != 0 ? imm5.ZeroExtend() : 32U;
+ return fmt::format("asrs {}, {}, #{}", d, m, shift);
+ }
+
+ std::string thumb16_ADD_reg_t1(Reg m, Reg n, Reg d) {
+ return fmt::format("adds {}, {}, {}", d, n, m);
+ }
+
+ std::string thumb16_SUB_reg(Reg m, Reg n, Reg d) {
+ return fmt::format("subs {}, {}, {}", d, n, m);
+ }
+
+ std::string thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d) {
+ return fmt::format("adds {}, {}, #{}", d, n, imm3.ZeroExtend());
+ }
+
+ std::string thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d) {
+ return fmt::format("subs {}, {}, #{}", d, n, imm3.ZeroExtend());
+ }
+
+ std::string thumb16_MOV_imm(Reg d, Imm<8> imm8) {
+ return fmt::format("movs {}, #{}", d, imm8.ZeroExtend());
+ }
+
+ std::string thumb16_CMP_imm(Reg n, Imm<8> imm8) {
+ return fmt::format("cmp {}, #{}", n, imm8.ZeroExtend());
+ }
+
+ std::string thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8) {
+ return fmt::format("adds {}, #{}", d_n, imm8.ZeroExtend());
+ }
+
+ std::string thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8) {
+ return fmt::format("subs {}, #{}", d_n, imm8.ZeroExtend());
+ }
+
+ std::string thumb16_AND_reg(Reg m, Reg d_n) {
+ return fmt::format("ands {}, {}", d_n, m);
+ }
+
+ std::string thumb16_EOR_reg(Reg m, Reg d_n) {
+ return fmt::format("eors {}, {}", d_n, m);
+ }
+
+ std::string thumb16_LSL_reg(Reg m, Reg d_n) {
+ return fmt::format("lsls {}, {}", d_n, m);
+ }
+
+ std::string thumb16_LSR_reg(Reg m, Reg d_n) {
+ return fmt::format("lsrs {}, {}", d_n, m);
+ }
+
+ std::string thumb16_ASR_reg(Reg m, Reg d_n) {
+ return fmt::format("asrs {}, {}", d_n, m);
+ }
+
+ std::string thumb16_ADC_reg(Reg m, Reg d_n) {
+ return fmt::format("adcs {}, {}", d_n, m);
+ }
+
+ std::string thumb16_SBC_reg(Reg m, Reg d_n) {
+ return fmt::format("sbcs {}, {}", d_n, m);
+ }
+
+ std::string thumb16_ROR_reg(Reg m, Reg d_n) {
+ return fmt::format("rors {}, {}", d_n, m);
+ }
+
+ std::string thumb16_TST_reg(Reg m, Reg n) {
+ return fmt::format("tst {}, {}", n, m);
+ }
+
+ std::string thumb16_RSB_imm(Reg n, Reg d) {
+ // Pre-UAL syntax: NEGS <Rd>, <Rn>
+ return fmt::format("rsbs {}, {}, #0", d, n);
+ }
+
+ std::string thumb16_CMP_reg_t1(Reg m, Reg n) {
+ return fmt::format("cmp {}, {}", n, m);
+ }
+
+ std::string thumb16_CMN_reg(Reg m, Reg n) {
+ return fmt::format("cmn {}, {}", n, m);
+ }
+
+ std::string thumb16_ORR_reg(Reg m, Reg d_n) {
+ return fmt::format("orrs {}, {}", d_n, m);
+ }
+
+ std::string thumb16_MUL_reg(Reg n, Reg d_m) {
+ return fmt::format("muls {}, {}, {}", d_m, n, d_m);
+ }
+
+ std::string thumb16_BIC_reg(Reg m, Reg d_n) {
+ return fmt::format("bics {}, {}", d_n, m);
+ }
+
+ std::string thumb16_MVN_reg(Reg m, Reg d) {
+ return fmt::format("mvns {}, {}", d, m);
+ }
+
+ std::string thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo) {
+ const Reg d_n = d_n_hi ? (d_n_lo + 8) : d_n_lo;
+ return fmt::format("add {}, {}", d_n, m);
+ }
+
+ std::string thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo) {
+ const Reg n = n_hi ? (n_lo + 8) : n_lo;
+ return fmt::format("cmp {}, {}", n, m);
+ }
+
+ std::string thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo) {
+ const Reg d = d_hi ? (d_lo + 8) : d_lo;
+ return fmt::format("mov {}, {}", d, m);
+ }
+
+ std::string thumb16_LDR_literal(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [pc, #{}]", t, imm32);
+ }
+
+ std::string thumb16_STR_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("str {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STRH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("strh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STRB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("strb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRSB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrsb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDR_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldr {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRSH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrsh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ return fmt::format("str {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ return fmt::format("strb {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ return fmt::format("ldrb {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ return fmt::format("strh {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ return fmt::format("ldrh {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("str {}, [sp, #{}]", t, imm32);
+ }
+
+ std::string thumb16_LDR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [sp, #{}]", t, imm32);
+ }
+
+ std::string thumb16_ADR(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("adr {}, +#{}", d, imm32);
+ }
+
+ std::string thumb16_ADD_sp_t1(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("add {}, sp, #{}", d, imm32);
+ }
+
+ std::string thumb16_ADD_sp_t2(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ return fmt::format("add sp, sp, #{}", imm32);
+ }
+
+ std::string thumb16_SUB_sp(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ return fmt::format("sub sp, sp, #{}", imm32);
+ }
+
+ std::string thumb16_NOP() {
+ return "nop";
+ }
+
+ std::string thumb16_SEV() {
+ return "sev";
+ }
+
+ std::string thumb16_SEVL() {
+ return "sevl";
+ }
+
+ std::string thumb16_WFE() {
+ return "wfe";
+ }
+
+ std::string thumb16_WFI() {
+ return "wfi";
+ }
+
+ std::string thumb16_YIELD() {
+ return "yield";
+ }
+
+ std::string thumb16_IT(Imm<8> imm8) {
+ const Cond firstcond = imm8.Bits<4, 7, Cond>();
+ const bool firstcond0 = imm8.Bit<4>();
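+ // Each remaining mask bit equal to firstcond<0> encodes a "then" (t) suffix; a differing bit encodes "else" (e).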
+ const auto [x, y, z] = [&] {
+ if (imm8.Bits<0, 3>() == 0b1000) {
+ return std::make_tuple("", "", "");
+ }
+ if (imm8.Bits<0, 2>() == 0b100) {
+ return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", "", "");
+ }
+ if (imm8.Bits<0, 1>() == 0b10) {
+ return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", imm8.Bit<2>() == firstcond0 ? "t" : "e", "");
+ }
+ // Sanity note: Here imm8.Bit<0>() is guaranteed to be == 1. (imm8 can never be 0bxxxx0000)
+ return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", imm8.Bit<2>() == firstcond0 ? "t" : "e", imm8.Bit<1>() == firstcond0 ? "t" : "e");
+ }();
+ return fmt::format("it{}{}{} {}", x, y, z, CondToString(firstcond));
+ }
+
+ std::string thumb16_SXTH(Reg m, Reg d) {
+ return fmt::format("sxth {}, {}", d, m);
+ }
+
+ std::string thumb16_SXTB(Reg m, Reg d) {
+ return fmt::format("sxtb {}, {}", d, m);
+ }
+
+ std::string thumb16_UXTH(Reg m, Reg d) {
+ return fmt::format("uxth {}, {}", d, m);
+ }
+
+ std::string thumb16_UXTB(Reg m, Reg d) {
+ return fmt::format("uxtb {}, {}", d, m);
+ }
+
+ std::string thumb16_PUSH(bool M, RegList reg_list) {
+ if (M)
+ reg_list |= 1 << 14;
+ return fmt::format("push {{{}}}", RegListToString(reg_list));
+ }
+
+ std::string thumb16_POP(bool P, RegList reg_list) {
+ if (P)
+ reg_list |= 1 << 15;
+ return fmt::format("pop {{{}}}", RegListToString(reg_list));
+ }
+
+ std::string thumb16_SETEND(bool E) {
+ return fmt::format("setend {}", E ? "BE" : "LE");
+ }
+
+ std::string thumb16_CPS(bool im, bool a, bool i, bool f) {
+ return fmt::format("cps{} {}{}{}", im ? "id" : "ie", a ? "a" : "", i ? "i" : "", f ? "f" : "");
+ }
+
+ std::string thumb16_REV(Reg m, Reg d) {
+ return fmt::format("rev {}, {}", d, m);
+ }
+
+ std::string thumb16_REV16(Reg m, Reg d) {
+ return fmt::format("rev16 {}, {}", d, m);
+ }
+
+ std::string thumb16_REVSH(Reg m, Reg d) {
+ return fmt::format("revsh {}, {}", d, m);
+ }
+
+ std::string thumb16_BKPT(Imm<8> imm8) {
+ return fmt::format("bkpt #{}", imm8.ZeroExtend());
+ }
+
+ std::string thumb16_STMIA(Reg n, RegList reg_list) {
+ return fmt::format("stm {}!, {{{}}}", n, RegListToString(reg_list));
+ }
+
+ std::string thumb16_LDMIA(Reg n, RegList reg_list) {
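+ // LDM (T1) writes back to the base register only when the base is not also in the register list.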
+ const bool write_back = !mcl::bit::get_bit(static_cast<size_t>(n), reg_list);
+ return fmt::format("ldm {}{}, {{{}}}", n, write_back ? "!" : "", RegListToString(reg_list));
+ }
+
+ std::string thumb16_BX(Reg m) {
+ return fmt::format("bx {}", m);
+ }
+
+ std::string thumb16_BLX_reg(Reg m) {
+ return fmt::format("blx {}", m);
+ }
+
+ std::string thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n) {
+ const char* const name = nonzero ? "cbnz" : "cbz";
+ const u32 imm = concatenate(i, imm5, Imm<1>{0}).ZeroExtend();
+
+ return fmt::format("{} {}, #{}", name, n, imm);
+ }
+
+ std::string thumb16_UDF() {
+ return fmt::format("udf");
+ }
+
+ std::string thumb16_SVC(Imm<8> imm8) {
+ return fmt::format("svc #{}", imm8.ZeroExtend());
+ }
+
+ std::string thumb16_B_t1(Cond cond, Imm<8> imm8) {
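+ // In Thumb state the PC reads as the instruction address + 4, so the printed offset includes the +4 adjustment.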
+ const s32 imm32 = static_cast<s32>((imm8.SignExtend<u32>() << 1) + 4);
+ return fmt::format("b{} {}#{}",
+ CondToString(cond),
+ Common::SignToChar(imm32),
+ abs(imm32));
+ }
+
+ std::string thumb16_B_t2(Imm<11> imm11) {
+ const s32 imm32 = static_cast<s32>((imm11.SignExtend<u32>() << 1) + 4);
+ return fmt::format("b {}#{}",
+ Common::SignToChar(imm32),
+ abs(imm32));
+ }
+};
+
+std::string DisassembleThumb16(u16 instruction) {
+ DisassemblerVisitor visitor;
+ auto decoder = DecodeThumb16<DisassemblerVisitor>(instruction);
+ return !decoder ? fmt::format("UNKNOWN: {:x}", instruction) : decoder->get().call(visitor, instruction);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp
new file mode 100644
index 0000000000..97a7f11adf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp
@@ -0,0 +1,27 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/ir/basic_block.h"
+
+namespace Dynarmic::A32 {
+
+IR::Block TranslateArm(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options);
+IR::Block TranslateThumb(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options);
+
+IR::Block Translate(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) {
+ return (descriptor.TFlag() ? TranslateThumb : TranslateArm)(descriptor, tcb, options);
+}
+
+bool TranslateSingleArmInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction);
+bool TranslateSingleThumbInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction);
+
+bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction) {
+ return (descriptor.TFlag() ? TranslateSingleThumbInstruction : TranslateSingleArmInstruction)(block, descriptor, instruction);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h
new file mode 100644
index 0000000000..0f2c3a121f
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h
@@ -0,0 +1,52 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/interface/A32/arch_version.h"
+
+namespace Dynarmic::IR {
+class Block;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::A32 {
+
+class LocationDescriptor;
+struct TranslateCallbacks;
+
+struct TranslationOptions {
+ ArchVersion arch_version;
+
+ /// This changes what IR we emit when we translate an unpredictable instruction.
+ /// If this is false, the ExceptionRaised IR instruction is emitted.
+ /// If this is true, we define some behaviour for some instructions.
+ bool define_unpredictable_behaviour = false;
+
+ /// This changes what IR we emit when we translate a hint instruction.
+ /// If this is false, we treat the instruction as a NOP.
+ /// If this is true, we emit an ExceptionRaised instruction.
+ bool hook_hint_instructions = true;
+};
+
+/**
+ * This function translates instructions in memory into our intermediate representation.
+ * @param descriptor The starting location of the basic block. Includes information like PC, Thumb state, &c.
+ * @param tcb The callbacks we should use to read emulated memory.
+ * @param options Configures how certain instructions are translated.
+ * @return A translated basic block in the intermediate representation.
+ */
+IR::Block Translate(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options);
+
+/**
+ * This function translates a single provided instruction into our intermediate representation.
+ * @param block The block to append the IR for the instruction to.
+ * @param descriptor The location of the instruction. Includes information like PC, Thumb state, &c.
+ * @param instruction The instruction to translate.
+ * @return The instruction translated to the intermediate representation.
+ */
+bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction);
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp
new file mode 100644
index 0000000000..8bd875c7d1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp
@@ -0,0 +1,82 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/conditional_state.h"
+
+#include <algorithm>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/a32_ir_emitter.h"
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/ir/cond.h"
+
+namespace Dynarmic::A32 {
+
+bool CondCanContinue(ConditionalState cond_state, const A32::IREmitter& ir) {
+ ASSERT_MSG(cond_state != ConditionalState::Break, "Should never happen.");
+
+ if (cond_state == ConditionalState::None)
+ return true;
+
+ // TODO: This is more conservative than necessary.
+ return std::all_of(ir.block.begin(), ir.block.end(), [](const IR::Inst& inst) { return !inst.WritesToCPSR(); });
+}
+
+bool IsConditionPassed(TranslatorVisitor& v, IR::Cond cond) {
+ ASSERT_MSG(v.cond_state != ConditionalState::Break,
+ "This should never happen. We requested a break but that wasn't honored.");
+
+ if (cond == IR::Cond::NV) {
+ // NV conditional is obsolete
+ v.cond_state = ConditionalState::Break;
+ v.RaiseException(Exception::UnpredictableInstruction);
+ return false;
+ }
+
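+ // Already translating a conditional run: either transition to trailing unconditional instructions, keep going while the condition matches the block condition, or end the block here.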
+ if (v.cond_state == ConditionalState::Translating) {
+ if (v.ir.block.ConditionFailedLocation() != v.ir.current_location || cond == IR::Cond::AL) {
+ v.cond_state = ConditionalState::Trailing;
+ } else {
+ if (cond == v.ir.block.GetCondition()) {
+ v.ir.block.SetConditionFailedLocation(v.ir.current_location.AdvancePC(static_cast<int>(v.current_instruction_size)).AdvanceIT());
+ v.ir.block.ConditionFailedCycleCount()++;
+ return true;
+ }
+
+ // cond has changed, abort
+ v.cond_state = ConditionalState::Break;
+ v.ir.SetTerm(IR::Term::LinkBlockFast{v.ir.current_location});
+ return false;
+ }
+ }
+
+ if (cond == IR::Cond::AL) {
+ // Everything is fine with the world
+ return true;
+ }
+
+ // non-AL cond
+
+ if (!v.ir.block.empty()) {
+ // We've already emitted instructions. Quit for now, we'll make a new block here later.
+ v.cond_state = ConditionalState::Break;
+ v.ir.SetTerm(IR::Term::LinkBlockFast{v.ir.current_location});
+ return false;
+ }
+
+ // We've not emitted instructions yet.
+ // We'll emit one instruction, and set the block-entry conditional appropriately.
+
+ v.cond_state = ConditionalState::Translating;
+ v.ir.block.SetCondition(cond);
+ v.ir.block.SetConditionFailedLocation(v.ir.current_location.AdvancePC(static_cast<int>(v.current_instruction_size)).AdvanceIT());
+ v.ir.block.ConditionFailedCycleCount() = v.ir.block.CycleCount() + 1;
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h
new file mode 100644
index 0000000000..18c8b1cccb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h
@@ -0,0 +1,33 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::IR {
+enum class Cond;
+} // namespace Dynarmic::IR
+
+namespace Dynarmic::A32 {
+
+class IREmitter;
+struct TranslatorVisitor;
+
+enum class ConditionalState {
+ /// We haven't met any conditional instructions yet.
+ None,
+ /// Current instruction is a conditional. This marks the end of this basic block.
+ Break,
+ /// This basic block is made up solely of conditional instructions.
+ Translating,
+ /// This basic block is made up of conditional instructions followed by unconditional instructions.
+ Trailing,
+};
+
+bool CondCanContinue(ConditionalState cond_state, const A32::IREmitter& ir);
+bool IsConditionPassed(TranslatorVisitor& v, IR::Cond cond);
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp
new file mode 100644
index 0000000000..d87cfcfe82
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp
@@ -0,0 +1,88 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// B <label>
+bool TranslatorVisitor::arm_B(Cond cond, Imm<24> imm24) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8;
+ const auto new_location = ir.current_location.AdvancePC(imm32);
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+// BL <label>
+bool TranslatorVisitor::arm_BL(Cond cond, Imm<24> imm24) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.PushRSB(ir.current_location.AdvancePC(4));
+ ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4));
+
+ const u32 imm32 = mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8;
+ const auto new_location = ir.current_location.AdvancePC(imm32);
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+// BLX <label>
+bool TranslatorVisitor::arm_BLX_imm(bool H, Imm<24> imm24) {
+ ir.PushRSB(ir.current_location.AdvancePC(4));
+ ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4));
+
+ const u32 imm32 = mcl::bit::sign_extend<26, u32>((imm24.ZeroExtend() << 2)) + (H ? 2 : 0) + 8;
+ const auto new_location = ir.current_location.AdvancePC(imm32).SetTFlag(true);
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+// BLX <Rm>
+bool TranslatorVisitor::arm_BLX_reg(Cond cond, Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.PushRSB(ir.current_location.AdvancePC(4));
+ ir.BXWritePC(ir.GetRegister(m));
+ ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+// BX <Rm>
+bool TranslatorVisitor::arm_BX(Cond cond, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.BXWritePC(ir.GetRegister(m));
+ if (m == Reg::R14) {
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ } else {
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ }
+
+ return false;
+}
+
+bool TranslatorVisitor::arm_BXJ(Cond cond, Reg m) {
+ // Jazelle not supported
+ return arm_BX(cond, m);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp
new file mode 100644
index 0000000000..8ee5441013
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp
@@ -0,0 +1,95 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// It's considered constrained UNPREDICTABLE behavior if either
+// CRC32 instruction variant is executed with a condition code
+// that is *not* 0xE (Always execute). In this case, the ARM architecture
+// requires one of the following behaviors:
+//
+// 1. The instruction is undefined.
+// 2. The instruction executes as a NOP.
+// 3. The instruction executes unconditionally.
+// 4. The instruction executes conditionally.
+//
+// It's also considered constrained UNPREDICTABLE behavior if
+// either CRC32 instruction variant is executed with a size specifier
+// of 64-bit (sz -> 0b11)
+//
+// In this case, either:
+//
+// 1. The instruction is undefined
+// 2. The instruction executes as a NOP.
+// 3. The instruction executes with the additional decode: size = 32.
+//
+// In both cases, we treat the instruction as unpredictable, to allow
+// library users to provide their own intended behavior
+// in the unpredictable exception handler.
+
+namespace {
+enum class CRCType {
+ Castagnoli,
+ ISO,
+};
+
+bool CRC32Variant(TranslatorVisitor& v, Cond cond, Imm<2> sz, Reg n, Reg d, Reg m, CRCType type) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ if (sz == 0b11) {
+ return v.UnpredictableInstruction();
+ }
+
+ if (cond != Cond::AL) {
+ return v.UnpredictableInstruction();
+ }
+
+ const IR::U32 result = [m, n, sz, type, &v] {
+ const IR::U32 accumulator = v.ir.GetRegister(n);
+ const IR::U32 data = v.ir.GetRegister(m);
+
+ if (type == CRCType::ISO) {
+ switch (sz.ZeroExtend()) {
+ case 0b00:
+ return v.ir.CRC32ISO8(accumulator, data);
+ case 0b01:
+ return v.ir.CRC32ISO16(accumulator, data);
+ case 0b10:
+ return v.ir.CRC32ISO32(accumulator, data);
+ }
+ } else {
+ switch (sz.ZeroExtend()) {
+ case 0b00:
+ return v.ir.CRC32Castagnoli8(accumulator, data);
+ case 0b01:
+ return v.ir.CRC32Castagnoli16(accumulator, data);
+ case 0b10:
+ return v.ir.CRC32Castagnoli32(accumulator, data);
+ }
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.ir.SetRegister(d, result);
+ return true;
+}
+} // Anonymous namespace
+
+// CRC32{B,H,W}{<q>} <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_CRC32(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) {
+ return CRC32Variant(*this, cond, sz, n, d, m, CRCType::ISO);
+}
+
+// CRC32C{B,H,W}{<q>} <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_CRC32C(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) {
+ return CRC32Variant(*this, cond, sz, n, d, m, CRCType::Castagnoli);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp
new file mode 100644
index 0000000000..c9616b0871
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp
@@ -0,0 +1,44 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+
+// BKPT #<imm16>
+bool TranslatorVisitor::arm_BKPT(Cond cond, Imm<12> /*imm12*/, Imm<4> /*imm4*/) {
+ if (cond != Cond::AL && !options.define_unpredictable_behaviour) {
+ return UnpredictableInstruction();
+ }
+ // UNPREDICTABLE: The instruction executes conditionally.
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ return RaiseException(Exception::Breakpoint);
+}
+
+// SVC<c> #<imm24>
+bool TranslatorVisitor::arm_SVC(Cond cond, Imm<24> imm24) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm24.ZeroExtend();
+ ir.PushRSB(ir.current_location.AdvancePC(4));
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.CallSupervisor(ir.Imm32(imm32));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}});
+ return false;
+}
+
+// UDF<c> #<imm16>
+bool TranslatorVisitor::arm_UDF() {
+ return UndefinedInstruction();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp
new file mode 100644
index 0000000000..276f8384e7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp
@@ -0,0 +1,110 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::ArmConditionPassed(Cond cond) {
+ return IsConditionPassed(*this, cond);
+}
+
+bool TranslatorVisitor::ThumbConditionPassed() {
+ const Cond cond = ir.current_location.IT().Cond();
+ return IsConditionPassed(*this, cond);
+}
+
+bool TranslatorVisitor::VFPConditionPassed(Cond cond) {
+ if (ir.current_location.TFlag()) {
+ ASSERT(cond == Cond::AL);
+ return true;
+ }
+ return ArmConditionPassed(cond);
+}
+
+bool TranslatorVisitor::InterpretThisInstruction() {
+ ir.SetTerm(IR::Term::Interpret(ir.current_location));
+ return false;
+}
+
+bool TranslatorVisitor::UnpredictableInstruction() {
+ return RaiseException(Exception::UnpredictableInstruction);
+}
+
+bool TranslatorVisitor::UndefinedInstruction() {
+ return RaiseException(Exception::UndefinedInstruction);
+}
+
+bool TranslatorVisitor::DecodeError() {
+ return RaiseException(Exception::DecodeError);
+}
+
+bool TranslatorVisitor::RaiseException(Exception exception) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + static_cast<u32>(current_instruction_size)));
+ ir.ExceptionRaised(exception);
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}});
+ return false;
+}
+
+IR::UAny TranslatorVisitor::I(size_t bitsize, u64 value) {
+ switch (bitsize) {
+ case 8:
+ return ir.Imm8(static_cast<u8>(value));
+ case 16:
+ return ir.Imm16(static_cast<u16>(value));
+ case 32:
+ return ir.Imm32(static_cast<u32>(value));
+ case 64:
+ return ir.Imm64(value);
+ default:
+ ASSERT_FALSE("Imm - get: Invalid bitsize");
+ }
+}
+
+IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitImmShift(IR::U32 value, ShiftType type, Imm<3> imm3, Imm<2> imm2, IR::U1 carry_in) {
+ return EmitImmShift(value, type, concatenate(imm3, imm2), carry_in);
+}
+
+IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitImmShift(IR::U32 value, ShiftType type, Imm<5> imm5, IR::U1 carry_in) {
+ u8 imm5_value = imm5.ZeroExtend<u8>();
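+ // An immediate shift encoding of 0 means a 32-bit shift for LSR/ASR, and RRX for ROR.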
+ switch (type) {
+ case ShiftType::LSL:
+ return ir.LogicalShiftLeft(value, ir.Imm8(imm5_value), carry_in);
+ case ShiftType::LSR:
+ imm5_value = imm5_value ? imm5_value : 32;
+ return ir.LogicalShiftRight(value, ir.Imm8(imm5_value), carry_in);
+ case ShiftType::ASR:
+ imm5_value = imm5_value ? imm5_value : 32;
+ return ir.ArithmeticShiftRight(value, ir.Imm8(imm5_value), carry_in);
+ case ShiftType::ROR:
+ if (imm5_value) {
+ return ir.RotateRight(value, ir.Imm8(imm5_value), carry_in);
+ } else {
+ return ir.RotateRightExtended(value, carry_in);
+ }
+ }
+ UNREACHABLE();
+}
+
+IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitRegShift(IR::U32 value, ShiftType type, IR::U8 amount, IR::U1 carry_in) {
+ switch (type) {
+ case ShiftType::LSL:
+ return ir.LogicalShiftLeft(value, amount, carry_in);
+ case ShiftType::LSR:
+ return ir.LogicalShiftRight(value, amount, carry_in);
+ case ShiftType::ASR:
+ return ir.ArithmeticShiftRight(value, amount, carry_in);
+ case ShiftType::ROR:
+ return ir.RotateRight(value, amount, carry_in);
+ }
+ UNREACHABLE();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h
new file mode 100644
index 0000000000..44ac24503c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h
@@ -0,0 +1,982 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bit/rotate.hpp>
+
+#include "dynarmic/frontend/A32/a32_ir_emitter.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+#include "dynarmic/frontend/A32/translate/conditional_state.h"
+#include "dynarmic/frontend/imm.h"
+
+namespace Dynarmic::A32 {
+
+enum class Exception;
+
+struct TranslatorVisitor final {
+ using instruction_return_type = bool;
+
+ explicit TranslatorVisitor(IR::Block& block, LocationDescriptor descriptor, const TranslationOptions& options)
+ : ir(block, descriptor, options.arch_version), options(options) {}
+
+ A32::IREmitter ir;
+ ConditionalState cond_state = ConditionalState::None;
+ TranslationOptions options;
+
+ size_t current_instruction_size;
+
+ bool ArmConditionPassed(Cond cond);
+ bool ThumbConditionPassed();
+ bool VFPConditionPassed(Cond cond);
+
+ bool InterpretThisInstruction();
+ bool UnpredictableInstruction();
+ bool UndefinedInstruction();
+ bool DecodeError();
+ bool RaiseException(Exception exception);
+
+ struct ImmAndCarry {
+ u32 imm32;
+ IR::U1 carry;
+ };
+
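+ // A32 modified immediate: imm8 rotated right by 2*rotate; when rotate is non-zero, the carry-out is bit 31 of the result.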
+ ImmAndCarry ArmExpandImm_C(int rotate, Imm<8> imm8, IR::U1 carry_in) {
+ u32 imm32 = imm8.ZeroExtend();
+ auto carry_out = carry_in;
+ if (rotate) {
+ imm32 = mcl::bit::rotate_right<u32>(imm8.ZeroExtend(), rotate * 2);
+ carry_out = ir.Imm1(mcl::bit::get_bit<31>(imm32));
+ }
+ return {imm32, carry_out};
+ }
+
+ u32 ArmExpandImm(int rotate, Imm<8> imm8) {
+ return ArmExpandImm_C(rotate, imm8, ir.Imm1(0)).imm32;
+ }
+
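+ // T32 modified immediate: when imm12<11:10> == 00, the value is one of four byte-replication patterns of imm12<7:0>;
+ // otherwise it is (1:imm12<6:0>) rotated right by imm12<11:7>, with the carry-out taken from bit 31 of the result.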
+ ImmAndCarry ThumbExpandImm_C(Imm<1> i, Imm<3> imm3, Imm<8> imm8, IR::U1 carry_in) {
+ const Imm<12> imm12 = concatenate(i, imm3, imm8);
+ if (imm12.Bits<10, 11>() == 0) {
+ const u32 imm32 = [&] {
+ const u32 imm8 = imm12.Bits<0, 7>();
+ switch (imm12.Bits<8, 9>()) {
+ case 0b00:
+ return imm8;
+ case 0b01:
+ return mcl::bit::replicate_element<u16, u32>(imm8);
+ case 0b10:
+ return mcl::bit::replicate_element<u16, u32>(imm8 << 8);
+ case 0b11:
+ return mcl::bit::replicate_element<u8, u32>(imm8);
+ }
+ UNREACHABLE();
+ }();
+ return {imm32, carry_in};
+ }
+ const u32 imm32 = mcl::bit::rotate_right<u32>((1 << 7) | imm12.Bits<0, 6>(), imm12.Bits<7, 11>());
+ return {imm32, ir.Imm1(mcl::bit::get_bit<31>(imm32))};
+ }
+
+ u32 ThumbExpandImm(Imm<1> i, Imm<3> imm3, Imm<8> imm8) {
+ return ThumbExpandImm_C(i, imm3, imm8, ir.Imm1(0)).imm32;
+ }
+
+ // Creates an immediate of the given value
+ IR::UAny I(size_t bitsize, u64 value);
+
+ IR::ResultAndCarry<IR::U32> EmitImmShift(IR::U32 value, ShiftType type, Imm<3> imm3, Imm<2> imm2, IR::U1 carry_in);
+ IR::ResultAndCarry<IR::U32> EmitImmShift(IR::U32 value, ShiftType type, Imm<5> imm5, IR::U1 carry_in);
+ IR::ResultAndCarry<IR::U32> EmitRegShift(IR::U32 value, ShiftType type, IR::U8 amount, IR::U1 carry_in);
+ template<typename FnT>
+ bool EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg n, ExtReg m, const FnT& fn);
+ template<typename FnT>
+ bool EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg m, const FnT& fn);
+
+ // Barrier instructions
+ bool arm_DMB(Imm<4> option);
+ bool arm_DSB(Imm<4> option);
+ bool arm_ISB(Imm<4> option);
+
+ // Branch instructions
+ bool arm_B(Cond cond, Imm<24> imm24);
+ bool arm_BL(Cond cond, Imm<24> imm24);
+ bool arm_BLX_imm(bool H, Imm<24> imm24);
+ bool arm_BLX_reg(Cond cond, Reg m);
+ bool arm_BX(Cond cond, Reg m);
+ bool arm_BXJ(Cond cond, Reg m);
+
+ // Coprocessor instructions
+ bool arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm);
+ bool arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8);
+ bool arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm);
+ bool arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm);
+ bool arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm);
+ bool arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm);
+ bool arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8);
+
+ // CRC32 instructions
+ bool arm_CRC32(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m);
+ bool arm_CRC32C(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m);
+
+ // Data processing instructions
+ bool arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8);
+ bool arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m);
+ bool arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8);
+ bool arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_CMP_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m);
+ bool arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8);
+ bool arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8);
+ bool arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8);
+ bool arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m);
+ bool arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8);
+ bool arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m);
+ bool arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8);
+ bool arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m);
+
+ // Exception generating instructions
+ bool arm_BKPT(Cond cond, Imm<12> imm12, Imm<4> imm4);
+ bool arm_SVC(Cond cond, Imm<24> imm24);
+ bool arm_UDF();
+
+ // Extension instructions
+ bool arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+ bool arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m);
+
+ // Hint instructions
+ bool arm_PLD_imm(bool add, bool R, Reg n, Imm<12> imm12);
+ bool arm_PLD_reg(bool add, bool R, Reg n, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_SEV();
+ bool arm_SEVL();
+ bool arm_WFE();
+ bool arm_WFI();
+ bool arm_YIELD();
+
+ // Load/Store
+ bool arm_LDRBT();
+ bool arm_LDRHT();
+ bool arm_LDRSBT();
+ bool arm_LDRSHT();
+ bool arm_LDRT();
+ bool arm_STRBT();
+ bool arm_STRHT();
+ bool arm_STRT();
+ bool arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12);
+ bool arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Imm<12> imm12);
+ bool arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12);
+ bool arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12);
+ bool arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Reg m);
+ bool arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Reg m);
+ bool arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m);
+ bool arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m);
+ bool arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12);
+ bool arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12);
+ bool arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m);
+ bool arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m);
+ bool arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b);
+ bool arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m);
+
+ // Load/Store multiple instructions
+ bool arm_LDM(Cond cond, bool W, Reg n, RegList list);
+ bool arm_LDMDA(Cond cond, bool W, Reg n, RegList list);
+ bool arm_LDMDB(Cond cond, bool W, Reg n, RegList list);
+ bool arm_LDMIB(Cond cond, bool W, Reg n, RegList list);
+ bool arm_LDM_usr();
+ bool arm_LDM_eret();
+ bool arm_STM(Cond cond, bool W, Reg n, RegList list);
+ bool arm_STMDA(Cond cond, bool W, Reg n, RegList list);
+ bool arm_STMDB(Cond cond, bool W, Reg n, RegList list);
+ bool arm_STMIB(Cond cond, bool W, Reg n, RegList list);
+ bool arm_STM_usr();
+
+ // Miscellaneous instructions
+ bool arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb);
+ bool arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n);
+ bool arm_CLZ(Cond cond, Reg d, Reg m);
+ bool arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12);
+ bool arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12);
+ bool arm_NOP() { return true; }
+ bool arm_RBIT(Cond cond, Reg d, Reg m);
+ bool arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n);
+ bool arm_SEL(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n);
+
+ // Unsigned sum of absolute difference functions
+ bool arm_USAD8(Cond cond, Reg d, Reg m, Reg n);
+ bool arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n);
+
+ // Packing instructions
+ bool arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m);
+ bool arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m);
+
+ // Reversal instructions
+ bool arm_REV(Cond cond, Reg d, Reg m);
+ bool arm_REV16(Cond cond, Reg d, Reg m);
+ bool arm_REVSH(Cond cond, Reg d, Reg m);
+
+ // Saturation instructions
+ bool arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n);
+ bool arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n);
+ bool arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n);
+ bool arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n);
+
+ // Divide instructions
+ bool arm_SDIV(Cond cond, Reg d, Reg m, Reg n);
+ bool arm_UDIV(Cond cond, Reg d, Reg m, Reg n);
+
+ // Multiply (Normal) instructions
+ bool arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n);
+ bool arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n);
+ bool arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n);
+
+ // Multiply (Long) instructions
+ bool arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n);
+ bool arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n);
+ bool arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n);
+ bool arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n);
+ bool arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n);
+
+ // Multiply (Halfword) instructions
+ bool arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n);
+ bool arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n);
+ bool arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n);
+
+ // Multiply (word by halfword) instructions
+ bool arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n);
+ bool arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n);
+
+ // Multiply (Most significant word) instructions
+ bool arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n);
+ bool arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n);
+ bool arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n);
+
+ // Multiply (Dual) instructions
+ bool arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n);
+ bool arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n);
+ bool arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n);
+ bool arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n);
+ bool arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n);
+ bool arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n);
+
+ // Parallel Add/Subtract (Modulo arithmetic) instructions
+ bool arm_SADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SSAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SSUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SSUB16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_USAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_USUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_USUB16(Cond cond, Reg n, Reg d, Reg m);
+
+ // Parallel Add/Subtract (Saturating) instructions
+ bool arm_QADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QSAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QSUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QSUB16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQSAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m);
+
+ // Parallel Add/Subtract (Halving) instructions
+ bool arm_SHADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SHADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SHASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SHSAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHADD8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHADD16(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHASX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHSAX(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m);
+
+ // Saturated Add/Subtract instructions
+ bool arm_QADD(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QSUB(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QDADD(Cond cond, Reg n, Reg d, Reg m);
+ bool arm_QDSUB(Cond cond, Reg n, Reg d, Reg m);
+
+ // Synchronization Primitive instructions
+ bool arm_CLREX();
+ bool arm_SWP(Cond cond, Reg n, Reg t, Reg t2);
+ bool arm_SWPB(Cond cond, Reg n, Reg t, Reg t2);
+ bool arm_STL(Cond cond, Reg n, Reg t);
+ bool arm_STLEX(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_STREX(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_LDA(Cond cond, Reg n, Reg t);
+ bool arm_LDAEX(Cond cond, Reg n, Reg t);
+ bool arm_LDREX(Cond cond, Reg n, Reg t);
+ bool arm_STLEXD(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_STREXD(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_LDAEXD(Cond cond, Reg n, Reg t);
+ bool arm_LDREXD(Cond cond, Reg n, Reg t);
+ bool arm_STLB(Cond cond, Reg n, Reg t);
+ bool arm_STLEXB(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_STREXB(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_LDAB(Cond cond, Reg n, Reg t);
+ bool arm_LDAEXB(Cond cond, Reg n, Reg t);
+ bool arm_LDREXB(Cond cond, Reg n, Reg t);
+ bool arm_STLH(Cond cond, Reg n, Reg t);
+ bool arm_STLEXH(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_STREXH(Cond cond, Reg n, Reg d, Reg t);
+ bool arm_LDAH(Cond cond, Reg n, Reg t);
+ bool arm_LDAEXH(Cond cond, Reg n, Reg t);
+ bool arm_LDREXH(Cond cond, Reg n, Reg t);
+
+ // Status register access instructions
+ bool arm_CPS();
+ bool arm_MRS(Cond cond, Reg d);
+ bool arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8);
+ bool arm_MSR_reg(Cond cond, unsigned mask, Reg n);
+ bool arm_RFE();
+ bool arm_SETEND(bool E);
+ bool arm_SRS();
+
+ // thumb16
+ bool thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d);
+ bool thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d);
+ bool thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d);
+ bool thumb16_ADD_reg_t1(Reg m, Reg n, Reg d);
+ bool thumb16_SUB_reg(Reg m, Reg n, Reg d);
+ bool thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d);
+ bool thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d);
+ bool thumb16_MOV_imm(Reg d, Imm<8> imm8);
+ bool thumb16_CMP_imm(Reg n, Imm<8> imm8);
+ bool thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8);
+ bool thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8);
+ bool thumb16_AND_reg(Reg m, Reg d_n);
+ bool thumb16_EOR_reg(Reg m, Reg d_n);
+ bool thumb16_LSL_reg(Reg m, Reg d_n);
+ bool thumb16_LSR_reg(Reg m, Reg d_n);
+ bool thumb16_ASR_reg(Reg m, Reg d_n);
+ bool thumb16_ADC_reg(Reg m, Reg d_n);
+ bool thumb16_SBC_reg(Reg m, Reg d_n);
+ bool thumb16_ROR_reg(Reg m, Reg d_n);
+ bool thumb16_TST_reg(Reg m, Reg n);
+ bool thumb16_RSB_imm(Reg n, Reg d);
+ bool thumb16_CMP_reg_t1(Reg m, Reg n);
+ bool thumb16_CMN_reg(Reg m, Reg n);
+ bool thumb16_ORR_reg(Reg m, Reg d_n);
+ bool thumb16_MUL_reg(Reg n, Reg d_m);
+ bool thumb16_BIC_reg(Reg m, Reg d_n);
+ bool thumb16_MVN_reg(Reg m, Reg d);
+ bool thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo);
+ bool thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo);
+ bool thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo);
+ bool thumb16_LDR_literal(Reg t, Imm<8> imm8);
+ bool thumb16_STR_reg(Reg m, Reg n, Reg t);
+ bool thumb16_STRH_reg(Reg m, Reg n, Reg t);
+ bool thumb16_STRB_reg(Reg m, Reg n, Reg t);
+ bool thumb16_LDRSB_reg(Reg m, Reg n, Reg t);
+ bool thumb16_LDR_reg(Reg m, Reg n, Reg t);
+ bool thumb16_LDRH_reg(Reg m, Reg n, Reg t);
+ bool thumb16_LDRB_reg(Reg m, Reg n, Reg t);
+ bool thumb16_LDRSH_reg(Reg m, Reg n, Reg t);
+ bool thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t);
+ bool thumb16_STR_imm_t2(Reg t, Imm<8> imm8);
+ bool thumb16_LDR_imm_t2(Reg t, Imm<8> imm8);
+ bool thumb16_ADR(Reg d, Imm<8> imm8);
+ bool thumb16_ADD_sp_t1(Reg d, Imm<8> imm8);
+ bool thumb16_ADD_sp_t2(Imm<7> imm7);
+ bool thumb16_SUB_sp(Imm<7> imm7);
+ bool thumb16_SEV();
+ bool thumb16_SEVL();
+ bool thumb16_WFE();
+ bool thumb16_WFI();
+ bool thumb16_YIELD();
+ bool thumb16_NOP();
+ bool thumb16_IT(Imm<8> imm8);
+ bool thumb16_SXTH(Reg m, Reg d);
+ bool thumb16_SXTB(Reg m, Reg d);
+ bool thumb16_UXTH(Reg m, Reg d);
+ bool thumb16_UXTB(Reg m, Reg d);
+ bool thumb16_PUSH(bool M, RegList reg_list);
+ bool thumb16_POP(bool P, RegList reg_list);
+ bool thumb16_SETEND(bool E);
+ bool thumb16_CPS(bool, bool, bool, bool);
+ bool thumb16_REV(Reg m, Reg d);
+ bool thumb16_REV16(Reg m, Reg d);
+ bool thumb16_REVSH(Reg m, Reg d);
+ bool thumb16_BKPT(Imm<8> imm8);
+ bool thumb16_STMIA(Reg n, RegList reg_list);
+ bool thumb16_LDMIA(Reg n, RegList reg_list);
+ bool thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n);
+ bool thumb16_UDF();
+ bool thumb16_BX(Reg m);
+ bool thumb16_BLX_reg(Reg m);
+ bool thumb16_SVC(Imm<8> imm8);
+ bool thumb16_B_t1(Cond cond, Imm<8> imm8);
+ bool thumb16_B_t2(Imm<11> imm11);
+
+ // thumb32 load/store multiple instructions
+ bool thumb32_LDMDB(bool W, Reg n, Imm<16> reg_list);
+ bool thumb32_LDMIA(bool W, Reg n, Imm<16> reg_list);
+ bool thumb32_POP(Imm<16> reg_list);
+ bool thumb32_PUSH(Imm<15> reg_list);
+ bool thumb32_STMIA(bool W, Reg n, Imm<15> reg_list);
+ bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list);
+
+ // thumb32 load/store dual, load/store exclusive, table branch instructions
+ bool thumb32_LDA(Reg n, Reg t);
+ bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8);
+ bool thumb32_LDREX(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_LDREXD(Reg n, Reg t, Reg t2);
+ bool thumb32_LDREXB(Reg n, Reg t);
+ bool thumb32_LDREXH(Reg n, Reg t);
+ bool thumb32_STL(Reg n, Reg t);
+ bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8);
+ bool thumb32_STREXB(Reg n, Reg t, Reg d);
+ bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d);
+ bool thumb32_STREXH(Reg n, Reg t, Reg d);
+ bool thumb32_TBB(Reg n, Reg m);
+ bool thumb32_TBH(Reg n, Reg m);
+
+ // thumb32 data processing (shifted register) instructions
+ bool thumb32_TST_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_AND_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_BIC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_MOV_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_ORR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_MVN_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_ORN_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_TEQ_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_EOR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_PKH(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<1> tb, Reg m);
+ bool thumb32_CMN_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_ADD_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_ADC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_SBC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_CMP_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_SUB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+ bool thumb32_RSB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);
+
+ // thumb32 data processing (modified immediate) instructions
+ bool thumb32_TST_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8);
+ bool thumb32_AND_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_BIC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_MOV_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_ORR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_MVN_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_ORN_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_TEQ_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8);
+ bool thumb32_EOR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_CMN_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8);
+ bool thumb32_ADD_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_ADC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_SBC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_CMP_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8);
+ bool thumb32_SUB_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_RSB_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+
+ // thumb32 data processing (plain binary immediate) instructions
+ bool thumb32_ADR_t2(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_ADR_t3(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_ADD_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_BFC(Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb);
+ bool thumb32_BFI(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb);
+ bool thumb32_MOVT(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_MOVW_imm(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_SBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1);
+ bool thumb32_SSAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm);
+ bool thumb32_SSAT16(Reg n, Reg d, Imm<4> sat_imm);
+ bool thumb32_SUB_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8);
+ bool thumb32_UBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1);
+ bool thumb32_USAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm);
+ bool thumb32_USAT16(Reg n, Reg d, Imm<4> sat_imm);
+
+ // thumb32 miscellaneous control instructions
+ bool thumb32_BXJ(Reg m);
+ bool thumb32_CLREX();
+ bool thumb32_DMB(Imm<4> option);
+ bool thumb32_DSB(Imm<4> option);
+ bool thumb32_ISB(Imm<4> option);
+ bool thumb32_NOP();
+ bool thumb32_SEV();
+ bool thumb32_SEVL();
+ bool thumb32_UDF();
+ bool thumb32_WFE();
+ bool thumb32_WFI();
+ bool thumb32_YIELD();
+ bool thumb32_MSR_reg(bool R, Reg n, Imm<4> mask);
+ bool thumb32_MRS_reg(bool R, Reg d);
+
+ // thumb32 branch instructions
+ bool thumb32_BL_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo);
+ bool thumb32_BLX_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo);
+ bool thumb32_B(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo);
+ bool thumb32_B_cond(Imm<1> S, Cond cond, Imm<6> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo);
+
+ // thumb32 store single data item instructions
+ bool thumb32_STRB_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8);
+ bool thumb32_STRB_imm_2(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STRB_imm_3(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_STRBT(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STRB(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_STRH_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8);
+ bool thumb32_STRH_imm_2(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STRH_imm_3(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_STRHT(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STRH(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_STR_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8);
+ bool thumb32_STR_imm_2(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STR_imm_3(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_STRT(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_STR_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+
+ // thumb32 load byte and memory hints
+ bool thumb32_PLD_lit(bool U, Imm<12> imm12);
+ bool thumb32_PLD_imm8(bool W, Reg n, Imm<8> imm8);
+ bool thumb32_PLD_imm12(bool W, Reg n, Imm<12> imm12);
+ bool thumb32_PLD_reg(bool W, Reg n, Imm<2> imm2, Reg m);
+ bool thumb32_PLI_lit(bool U, Imm<12> imm12);
+ bool thumb32_PLI_imm8(Reg n, Imm<8> imm8);
+ bool thumb32_PLI_imm12(Reg n, Imm<12> imm12);
+ bool thumb32_PLI_reg(Reg n, Imm<2> imm2, Reg m);
+ bool thumb32_LDRB_lit(bool U, Reg t, Imm<12> imm12);
+ bool thumb32_LDRB_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_LDRB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8);
+ bool thumb32_LDRB_imm12(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_LDRBT(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_LDRSB_lit(bool U, Reg t, Imm<12> imm12);
+ bool thumb32_LDRSB_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_LDRSB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8);
+ bool thumb32_LDRSB_imm12(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_LDRSBT(Reg n, Reg t, Imm<8> imm8);
+
+ // thumb32 load halfword instructions
+ bool thumb32_LDRH_lit(bool U, Reg t, Imm<12> imm12);
+ bool thumb32_LDRH_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_LDRH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8);
+ bool thumb32_LDRH_imm12(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_LDRHT(Reg n, Reg t, Imm<8> imm8);
+ bool thumb32_LDRSH_lit(bool U, Reg t, Imm<12> imm12);
+ bool thumb32_LDRSH_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_LDRSH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8);
+ bool thumb32_LDRSH_imm12(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_LDRSHT(Reg n, Reg t, Imm<8> imm8);
+
+ // thumb32 load word instructions
+ bool thumb32_LDR_lit(bool U, Reg t, Imm<12> imm12);
+ bool thumb32_LDR_reg(Reg n, Reg t, Imm<2> imm2, Reg m);
+ bool thumb32_LDR_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8);
+ bool thumb32_LDR_imm12(Reg n, Reg t, Imm<12> imm12);
+ bool thumb32_LDRT(Reg n, Reg t, Imm<8> imm8);
+
+ // thumb32 data processing (register) instructions
+ bool thumb32_ASR_reg(bool S, Reg m, Reg d, Reg s);
+ bool thumb32_LSL_reg(bool S, Reg m, Reg d, Reg s);
+ bool thumb32_LSR_reg(bool S, Reg m, Reg d, Reg s);
+ bool thumb32_ROR_reg(bool S, Reg m, Reg d, Reg s);
+ bool thumb32_SXTB(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_SXTB16(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_SXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_SXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_SXTH(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_SXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTB(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTB16(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTH(Reg d, SignExtendRotation rotate, Reg m);
+ bool thumb32_UXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m);
+
+ // thumb32 long multiply, long multiply accumulate, and divide instructions
+ bool thumb32_SDIV(Reg n, Reg d, Reg m);
+ bool thumb32_SMLAL(Reg n, Reg dLo, Reg dHi, Reg m);
+ bool thumb32_SMLALD(Reg n, Reg dLo, Reg dHi, bool M, Reg m);
+ bool thumb32_SMLALXY(Reg n, Reg dLo, Reg dHi, bool N, bool M, Reg m);
+ bool thumb32_SMLSLD(Reg n, Reg dLo, Reg dHi, bool M, Reg m);
+ bool thumb32_SMULL(Reg n, Reg dLo, Reg dHi, Reg m);
+ bool thumb32_UDIV(Reg n, Reg d, Reg m);
+ bool thumb32_UMAAL(Reg n, Reg dLo, Reg dHi, Reg m);
+ bool thumb32_UMLAL(Reg n, Reg dLo, Reg dHi, Reg m);
+ bool thumb32_UMULL(Reg n, Reg dLo, Reg dHi, Reg m);
+
+ // thumb32 miscellaneous instructions
+ bool thumb32_CLZ(Reg n, Reg d, Reg m);
+ bool thumb32_QADD(Reg n, Reg d, Reg m);
+ bool thumb32_QDADD(Reg n, Reg d, Reg m);
+ bool thumb32_QDSUB(Reg n, Reg d, Reg m);
+ bool thumb32_QSUB(Reg n, Reg d, Reg m);
+ bool thumb32_RBIT(Reg n, Reg d, Reg m);
+ bool thumb32_REV(Reg n, Reg d, Reg m);
+ bool thumb32_REV16(Reg n, Reg d, Reg m);
+ bool thumb32_REVSH(Reg n, Reg d, Reg m);
+ bool thumb32_SEL(Reg n, Reg d, Reg m);
+
+ // thumb32 multiply instructions
+ bool thumb32_MLA(Reg n, Reg a, Reg d, Reg m);
+ bool thumb32_MLS(Reg n, Reg a, Reg d, Reg m);
+ bool thumb32_MUL(Reg n, Reg d, Reg m);
+ bool thumb32_SMLAD(Reg n, Reg a, Reg d, bool X, Reg m);
+ bool thumb32_SMLAXY(Reg n, Reg a, Reg d, bool N, bool M, Reg m);
+ bool thumb32_SMLAWY(Reg n, Reg a, Reg d, bool M, Reg m);
+ bool thumb32_SMLSD(Reg n, Reg a, Reg d, bool X, Reg m);
+ bool thumb32_SMMLA(Reg n, Reg a, Reg d, bool R, Reg m);
+ bool thumb32_SMMLS(Reg n, Reg a, Reg d, bool R, Reg m);
+ bool thumb32_SMMUL(Reg n, Reg d, bool R, Reg m);
+ bool thumb32_SMUAD(Reg n, Reg d, bool M, Reg m);
+ bool thumb32_SMUSD(Reg n, Reg d, bool M, Reg m);
+ bool thumb32_SMULXY(Reg n, Reg d, bool N, bool M, Reg m);
+ bool thumb32_SMULWY(Reg n, Reg d, bool M, Reg m);
+ bool thumb32_USAD8(Reg n, Reg d, Reg m);
+ bool thumb32_USADA8(Reg n, Reg a, Reg d, Reg m);
+
+ // thumb32 parallel add/sub instructions
+ bool thumb32_SADD8(Reg n, Reg d, Reg m);
+ bool thumb32_SADD16(Reg n, Reg d, Reg m);
+ bool thumb32_SASX(Reg n, Reg d, Reg m);
+ bool thumb32_SSAX(Reg n, Reg d, Reg m);
+ bool thumb32_SSUB8(Reg n, Reg d, Reg m);
+ bool thumb32_SSUB16(Reg n, Reg d, Reg m);
+ bool thumb32_UADD8(Reg n, Reg d, Reg m);
+ bool thumb32_UADD16(Reg n, Reg d, Reg m);
+ bool thumb32_UASX(Reg n, Reg d, Reg m);
+ bool thumb32_USAX(Reg n, Reg d, Reg m);
+ bool thumb32_USUB8(Reg n, Reg d, Reg m);
+ bool thumb32_USUB16(Reg n, Reg d, Reg m);
+
+ bool thumb32_QADD8(Reg n, Reg d, Reg m);
+ bool thumb32_QADD16(Reg n, Reg d, Reg m);
+ bool thumb32_QASX(Reg n, Reg d, Reg m);
+ bool thumb32_QSAX(Reg n, Reg d, Reg m);
+ bool thumb32_QSUB8(Reg n, Reg d, Reg m);
+ bool thumb32_QSUB16(Reg n, Reg d, Reg m);
+ bool thumb32_UQADD8(Reg n, Reg d, Reg m);
+ bool thumb32_UQADD16(Reg n, Reg d, Reg m);
+ bool thumb32_UQASX(Reg n, Reg d, Reg m);
+ bool thumb32_UQSAX(Reg n, Reg d, Reg m);
+ bool thumb32_UQSUB8(Reg n, Reg d, Reg m);
+ bool thumb32_UQSUB16(Reg n, Reg d, Reg m);
+
+ bool thumb32_SHADD8(Reg n, Reg d, Reg m);
+ bool thumb32_SHADD16(Reg n, Reg d, Reg m);
+ bool thumb32_SHASX(Reg n, Reg d, Reg m);
+ bool thumb32_SHSAX(Reg n, Reg d, Reg m);
+ bool thumb32_SHSUB8(Reg n, Reg d, Reg m);
+ bool thumb32_SHSUB16(Reg n, Reg d, Reg m);
+ bool thumb32_UHADD8(Reg n, Reg d, Reg m);
+ bool thumb32_UHADD16(Reg n, Reg d, Reg m);
+ bool thumb32_UHASX(Reg n, Reg d, Reg m);
+ bool thumb32_UHSAX(Reg n, Reg d, Reg m);
+ bool thumb32_UHSUB8(Reg n, Reg d, Reg m);
+ bool thumb32_UHSUB16(Reg n, Reg d, Reg m);
+
+ // thumb32 coprocessor instructions
+ bool thumb32_MCRR(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm);
+ bool thumb32_MRRC(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm);
+ bool thumb32_STC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8);
+ bool thumb32_LDC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8);
+ bool thumb32_CDP(bool two, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm);
+ bool thumb32_MCR(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm);
+ bool thumb32_MRC(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm);
+
+ // Floating-point three-register data processing instructions
+ bool vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+ bool vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm);
+
+ // Floating-point move instructions
+ bool vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D);
+ bool vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N);
+ bool vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N);
+ bool vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N);
+ bool vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm);
+ bool vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm);
+ bool vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm);
+ bool vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm);
+ bool vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D);
+ bool vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2);
+ bool vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2);
+ bool vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N);
+ bool vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2);
+ bool vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2);
+ bool vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E);
+ bool vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L);
+ bool vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+
+ // Floating-point misc instructions
+ bool vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm);
+ bool vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E);
+ bool vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm);
+ bool vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4);
+ bool vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm);
+ bool vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm);
+ bool vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4);
+ bool vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm);
+ bool vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm);
+
+ // Floating-point system register access
+ bool vfp_VMSR(Cond cond, Reg t);
+ bool vfp_VMRS(Cond cond, Reg t);
+
+ // Floating-point load-store instructions
+ bool vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8);
+ bool vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8);
+ bool vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8);
+ bool vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8);
+ bool vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8);
+ bool vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8);
+ bool vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8);
+ bool vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8);
+
+ // Advanced SIMD one register, modified immediate
+ bool asimd_VMOV_imm(Imm<1> a, bool D, Imm<1> b, Imm<1> c, Imm<1> d, size_t Vd, Imm<4> cmode, bool Q, bool op, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h);
+
+ // Advanced SIMD three register with same length
+ bool asimd_VHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VQADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VRHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VAND_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VBIC_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VORR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VORN_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VEOR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VBSL(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VBIT(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VBIF(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VHSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VQSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCGT_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCGE_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VABD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VABA(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VADD_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VSUB_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VQSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMAX(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm);
+ bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCEQ_reg(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMLA(bool op, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VPMAX_int(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm);
+ bool v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VQDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VQRDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VPADD(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VFMA(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VFMS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VSUB_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VPADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VABD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMLA_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMLS_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCEQ_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCGE_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VCGT_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VPMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+ bool v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+
+ // Advanced SIMD three registers with different lengths
+ bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
+ bool asimd_VSUBL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
+ bool asimd_VABAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+ bool asimd_VABDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+ bool asimd_VMLAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
+ bool asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool P, bool N, bool M, size_t Vm);
+
+ // Advanced SIMD two registers and a scalar
+ bool asimd_VMLA_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool F, bool N, bool M, size_t Vm);
+ bool asimd_VMLAL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
+ bool asimd_VMUL_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm);
+ bool asimd_VMULL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+ bool asimd_VQDMULL_scalar(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+ bool asimd_VQDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+ bool asimd_VQRDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm);
+
+ // Two registers and a shift amount
+ bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_SRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VRSHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VRSRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VSRI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VSHL(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VSLI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VQSHL(bool U, bool D, size_t imm6, size_t Vd, bool op, bool L, bool Q, bool M, size_t Vm);
+ bool asimd_VSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VRSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQRSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQRSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VSHLL(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm);
+ bool asimd_VCVT_fixed(bool U, bool D, size_t imm6, size_t Vd, bool to_fixed, bool Q, bool M, size_t Vm);
+
+ // Advanced SIMD two register, miscellaneous
+ bool asimd_VREV(bool D, size_t sz, size_t Vd, size_t op, bool Q, bool M, size_t Vm);
+ bool asimd_VPADDL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool v8_AESD(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VMVN_reg(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VPADAL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool asimd_VQABS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VQNEG(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VCGT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VCGE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VCEQ_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VCLE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VCLT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VABS(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VNEG(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VSWP(bool D, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VTRN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VUZP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VZIP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VMOVN(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQMOVUN(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool asimd_VQMOVN(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm);
+ bool asimd_VSHLL_max(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+ bool v8_VRINTN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool v8_VRINTX(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool v8_VRINTA(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool v8_VRINTZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool v8_VRINTM(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool v8_VRINTP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
+ bool asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm);
+ bool v8_VCVTA(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool v8_VCVTN(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool v8_VCVTP(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool v8_VCVTM(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm);
+ bool asimd_VRECPE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VRSQRTE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm);
+ bool asimd_VCVT_integer(bool D, size_t sz, size_t Vd, bool op, bool U, bool Q, bool M, size_t Vm);
+
+ // Advanced SIMD miscellaneous
+ bool asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm);
+ bool asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
+ bool asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm);
+ bool asimd_VDUP_scalar(bool D, Imm<4> imm4, size_t Vd, bool Q, bool M, size_t Vm);
+
+ // Advanced SIMD load/store structures
+ bool v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m);
+ bool v8_VLD_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m);
+ bool v8_VLD_all_lanes(bool D, Reg n, size_t Vd, size_t nn, size_t sz, bool T, bool a, Reg m);
+ bool v8_VST_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m);
+ bool v8_VLD_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m);
+};
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp
new file mode 100644
index 0000000000..d444e023e9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp
@@ -0,0 +1,375 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+#include <tuple>
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+namespace {
+
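+// Maps the 'type' field to {elements per structure, registers per list entry, register
+// increment}; e.g. 0b1010 (VST1/VLD1 with two registers) yields {1, 2, 0}. Reserved
+// size/align combinations yield std::nullopt, which callers treat as UNDEFINED.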
+std::optional<std::tuple<size_t, size_t, size_t>> DecodeType(Imm<4> type, size_t size, size_t align) {
+ switch (type.ZeroExtend()) {
+ case 0b0111: // VST1 A1 / VLD1 A1
+ if (mcl::bit::get_bit<1>(align)) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{1, 1, 0};
+ case 0b1010: // VST1 A2 / VLD1 A2
+ if (align == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{1, 2, 0};
+ case 0b0110: // VST1 A3 / VLD1 A3
+ if (mcl::bit::get_bit<1>(align)) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{1, 3, 0};
+ case 0b0010: // VST1 A4 / VLD1 A4
+ return std::tuple<size_t, size_t, size_t>{1, 4, 0};
+ case 0b1000: // VST2 A1 / VLD2 A1
+ if (size == 0b11 || align == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{2, 1, 1};
+ case 0b1001: // VST2 A1 / VLD2 A1
+ if (size == 0b11 || align == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{2, 1, 2};
+ case 0b0011: // VST2 A2 / VLD2 A2
+ if (size == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{2, 2, 2};
+ case 0b0100: // VST3 / VLD3
+ if (size == 0b11 || mcl::bit::get_bit<1>(align)) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{3, 1, 1};
+ case 0b0101: // VST3 / VLD3
+ if (size == 0b11 || mcl::bit::get_bit<1>(align)) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{3, 1, 2};
+ case 0b0000: // VST4 / VLD4
+ if (size == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{4, 1, 1};
+ case 0b0001: // VST4 / VLD4
+ if (size == 0b11) {
+ return std::nullopt;
+ }
+ return std::tuple<size_t, size_t, size_t>{4, 1, 2};
+ }
+ ASSERT_FALSE("Decode error");
+}
+} // namespace
+
+bool TranslatorVisitor::v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t size, size_t align, Reg m) {
+ if (type == 0b1011 || type.Bits<2, 3>() == 0b11) {
+ return DecodeError();
+ }
+
+ const auto decoded_type = DecodeType(type, size, align);
+ if (!decoded_type) {
+ return UndefinedInstruction();
+ }
+ const auto [nelem, regs, inc] = *decoded_type;
+
+ const ExtReg d = ToExtRegD(Vd, D);
+ const size_t d_last = RegNumber(d) + inc * (nelem - 1);
+ if (n == Reg::R15 || d_last + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ [[maybe_unused]] const size_t alignment = align == 0 ? 1 : 4 << align;
+ const size_t ebytes = static_cast<size_t>(1) << size;
+ const size_t elements = 8 / ebytes;
+
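+ // Writeback follows the usual Advanced SIMD addressing convention: Rm == R15 means no
+ // writeback, Rm == R13 post-increments Rn by the number of bytes transferred, and any
+ // other Rm is added to Rn.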
+ const bool wback = m != Reg::R15;
+ const bool register_index = m != Reg::R15 && m != Reg::R13;
+
+ IR::U32 address = ir.GetRegister(n);
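+ // Store in structure-interleaved order: for each element index, the corresponding element
+ // of each of the 'nelem' structure registers is written to consecutive addresses.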
+ for (size_t r = 0; r < regs; r++) {
+ for (size_t e = 0; e < elements; e++) {
+ for (size_t i = 0; i < nelem; i++) {
+ const ExtReg ext_reg = d + i * inc + r;
+ const IR::U64 shifted_element = ir.LogicalShiftRight(ir.GetExtendedRegister(ext_reg), ir.Imm8(static_cast<u8>(e * ebytes * 8)));
+ const IR::UAny element = ir.LeastSignificant(8 * ebytes, shifted_element);
+ ir.WriteMemory(8 * ebytes, address, element, IR::AccType::NORMAL);
+
+ address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes)));
+ }
+ }
+ }
+
+ if (wback) {
+ if (register_index) {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m)));
+ } else {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(8 * nelem * regs))));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::v8_VLD_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t size, size_t align, Reg m) {
+ if (type == 0b1011 || type.Bits<2, 3>() == 0b11) {
+ return DecodeError();
+ }
+
+ const auto decoded_type = DecodeType(type, size, align);
+ if (!decoded_type) {
+ return UndefinedInstruction();
+ }
+ const auto [nelem, regs, inc] = *decoded_type;
+
+ const ExtReg d = ToExtRegD(Vd, D);
+ const size_t d_last = RegNumber(d) + inc * (nelem - 1);
+ if (n == Reg::R15 || d_last + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ [[maybe_unused]] const size_t alignment = align == 0 ? 1 : 4 << align;
+ const size_t ebytes = static_cast<size_t>(1) << size;
+ const size_t elements = 8 / ebytes;
+
+ const bool wback = m != Reg::R15;
+ const bool register_index = m != Reg::R15 && m != Reg::R13;
+
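+ // Destination registers are cleared first; each loaded element is shifted into position
+ // and OR-ed in by the loop below.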
+ for (size_t r = 0; r < regs; r++) {
+ for (size_t i = 0; i < nelem; i++) {
+ const ExtReg ext_reg = d + i * inc + r;
+ ir.SetExtendedRegister(ext_reg, ir.Imm64(0));
+ }
+ }
+
+ IR::U32 address = ir.GetRegister(n);
+ for (size_t r = 0; r < regs; r++) {
+ for (size_t e = 0; e < elements; e++) {
+ for (size_t i = 0; i < nelem; i++) {
+ const IR::U64 element = ir.ZeroExtendToLong(ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL));
+ const IR::U64 shifted_element = ir.LogicalShiftLeft(element, ir.Imm8(static_cast<u8>(e * ebytes * 8)));
+
+ const ExtReg ext_reg = d + i * inc + r;
+ ir.SetExtendedRegister(ext_reg, ir.Or(ir.GetExtendedRegister(ext_reg), shifted_element));
+
+ address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes)));
+ }
+ }
+ }
+
+ if (wback) {
+ if (register_index) {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m)));
+ } else {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(8 * nelem * regs))));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::v8_VLD_all_lanes(bool D, Reg n, size_t Vd, size_t nn, size_t sz, bool T, bool a, Reg m) {
+ const size_t nelem = nn + 1;
+
+ if (nelem == 1 && (sz == 0b11 || (sz == 0b00 && a))) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 2 && sz == 0b11) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 3 && (sz == 0b11 || a)) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 4 && (sz == 0b11 && !a)) {
+ return UndefinedInstruction();
+ }
+
+ const size_t ebytes = sz == 0b11 ? 4 : (1 << sz);
+ const size_t inc = T ? 2 : 1;
+ const size_t regs = nelem == 1 ? inc : 1;
+ [[maybe_unused]] const size_t alignment = [&]() -> size_t {
+ if (a && nelem == 1) {
+ return ebytes;
+ }
+ if (a && nelem == 2) {
+ return ebytes * 2;
+ }
+ if (a && nelem == 4) {
+ return sz >= 0b10 ? 2 * ebytes : 4 * ebytes;
+ }
+ return 1;
+ }();
+
+ const ExtReg d = ToExtRegD(Vd, D);
+ const size_t d_last = RegNumber(d) + inc * (nelem - 1);
+ if (n == Reg::R15 || d_last + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ const bool wback = m != Reg::R15;
+ const bool register_index = m != Reg::R15 && m != Reg::R13;
+
+ auto address = ir.GetRegister(n);
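+ // Each element read from memory is replicated across every lane of its destination
+ // register(s).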
+ for (size_t i = 0; i < nelem; i++) {
+ const auto element = ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL);
+ const auto replicated_element = ir.VectorBroadcast(ebytes * 8, element);
+
+ for (size_t r = 0; r < regs; r++) {
+ const ExtReg ext_reg = d + i * inc + r;
+ ir.SetVector(ext_reg, replicated_element);
+ }
+
+ address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes)));
+ }
+
+ if (wback) {
+ if (register_index) {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m)));
+ } else {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes))));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::v8_VST_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m) {
+ const size_t nelem = nn + 1;
+
+ if (sz == 0b11) {
+ return DecodeError();
+ }
+
+ if (nelem == 1 && mcl::bit::get_bit(sz, index_align)) {
+ return UndefinedInstruction();
+ }
+
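+ // index_align packs three fields: the lane index in its upper bits, a register-spacing bit
+ // at position 'sz' (halfword/word elements only), and alignment bits below that.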
+ const size_t ebytes = size_t(1) << sz;
+ const size_t index = mcl::bit::get_bits(sz + 1, 3, index_align);
+ const size_t inc = (sz != 0 && mcl::bit::get_bit(sz, index_align)) ? 2 : 1;
+ const size_t a = mcl::bit::get_bits(0, sz ? sz - 1 : 0, index_align);
+
+ if (nelem == 1 && inc == 2) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 1 && sz == 2 && (a != 0b00 && a != 0b11)) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 2 && mcl::bit::get_bit<1>(a)) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 3 && a != 0b00) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 4 && a == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ // TODO: alignment
+
+ const ExtReg d = ToExtRegD(Vd, D);
+ const size_t d_last = RegNumber(d) + inc * (nelem - 1);
+ if (n == Reg::R15 || d_last + 1 > 32) {
+ return UnpredictableInstruction();
+ }
+
+ const bool wback = m != Reg::R15;
+ const bool register_index = m != Reg::R15 && m != Reg::R13;
+
+ auto address = ir.GetRegister(n);
+ for (size_t i = 0; i < nelem; i++) {
+ const ExtReg ext_reg = d + i * inc;
+ const auto element = ir.VectorGetElement(ebytes * 8, ir.GetVector(ext_reg), index);
+
+ ir.WriteMemory(ebytes * 8, address, element, IR::AccType::NORMAL);
+
+ address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes)));
+ }
+
+ if (wback) {
+ if (register_index) {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m)));
+ } else {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes))));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::v8_VLD_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m) {
+ const size_t nelem = nn + 1;
+
+ if (sz == 0b11) {
+ return DecodeError();
+ }
+
+ if (nelem == 1 && mcl::bit::get_bit(sz, index_align)) {
+ return UndefinedInstruction();
+ }
+
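+ // Same index_align layout as in v8_VST_single above: lane index in the upper bits,
+ // register-spacing bit at position 'sz', alignment bits below.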
+ const size_t ebytes = size_t(1) << sz;
+ const size_t index = mcl::bit::get_bits(sz + 1, 3, index_align);
+ const size_t inc = (sz != 0 && mcl::bit::get_bit(sz, index_align)) ? 2 : 1;
+ const size_t a = mcl::bit::get_bits(0, sz ? sz - 1 : 0, index_align);
+
+ if (nelem == 1 && inc == 2) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 1 && sz == 2 && (a != 0b00 && a != 0b11)) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 2 && mcl::bit::get_bit<1>(a)) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 3 && a != 0b00) {
+ return UndefinedInstruction();
+ }
+ if (nelem == 4 && a == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ // TODO: alignment
+
+ const ExtReg d = ToExtRegD(Vd, D);
+ const size_t d_last = RegNumber(d) + inc * (nelem - 1);
+ if (n == Reg::R15 || d_last + 1 > 32) {
+ return UnpredictableInstruction();
+ }
+
+ const bool wback = m != Reg::R15;
+ const bool register_index = m != Reg::R15 && m != Reg::R13;
+
+ auto address = ir.GetRegister(n);
+ for (size_t i = 0; i < nelem; i++) {
+ const auto element = ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL);
+
+ const ExtReg ext_reg = d + i * inc;
+ const auto new_reg = ir.VectorSetElement(ebytes * 8, ir.GetVector(ext_reg), index, element);
+ ir.SetVector(ext_reg, new_reg);
+
+ address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes)));
+ }
+
+ if (wback) {
+ if (register_index) {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m)));
+ } else {
+ ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes))));
+ }
+ }
+
+ return true;
+}
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp
new file mode 100644
index 0000000000..e30c438eff
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp
@@ -0,0 +1,93 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <vector>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+static bool TableLookup(TranslatorVisitor& v, bool is_vtbl, bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
+ const size_t length = len + 1;
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(false, Vn, N);
+
+ if (RegNumber(n) + length > 32) {
+ return v.UnpredictableInstruction();
+ }
+
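+ // VTBL zeroes result bytes whose index is out of range, whereas VTBX leaves the
+ // corresponding destination bytes unchanged; the difference is expressed via 'defaults'.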
+ const IR::Table table = v.ir.VectorTable([&] {
+ std::vector<IR::U64> result;
+ for (size_t i = 0; i < length; ++i) {
+ result.emplace_back(v.ir.GetExtendedRegister(n + i));
+ }
+ return result;
+ }());
+ const IR::U64 indices = v.ir.GetExtendedRegister(m);
+ const IR::U64 defaults = is_vtbl ? v.ir.Imm64(0) : IR::U64{v.ir.GetExtendedRegister(d)};
+ const IR::U64 result = v.ir.VectorTableLookup(defaults, table, indices);
+
+ v.ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (!Q && imm4.Bit<3>()) {
+ return UndefinedInstruction();
+ }
+
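+ // imm4 is the starting byte offset into the concatenated source registers; the IR extract
+ // operation takes the offset in bits, hence the multiply by 8.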
+ const u8 position = 8 * imm4.ZeroExtend<u8>();
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = Q ? ir.VectorExtract(reg_n, reg_m, position) : ir.VectorExtractLower(reg_n, reg_m, position);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
+ return TableLookup(*this, true, D, Vn, Vd, len, N, M, Vm);
+}
+
+bool TranslatorVisitor::asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) {
+ return TableLookup(*this, false, D, Vn, Vd, len, N, M, Vm);
+}
+
+bool TranslatorVisitor::asimd_VDUP_scalar(bool D, Imm<4> imm4, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (Q && mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ if (imm4.Bits<0, 2>() == 0b000) {
+ return UndefinedInstruction();
+ }
+
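+ // imm4 encodes both element size and lane index: the lowest set bit selects the size
+ // (8 << lsb bits) and the bits above it give the index of the scalar to duplicate,
+ // e.g. imm4 == 0b0100 selects 32-bit elements at index 0.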
+ const size_t imm4_lsb = mcl::bit::lowest_set_bit(imm4.ZeroExtend());
+ const size_t esize = 8u << imm4_lsb;
+ const size_t index = imm4.ZeroExtend() >> (imm4_lsb + 1);
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorBroadcastElement(esize, reg_m, index);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp
new file mode 100644
index 0000000000..94e3e841e8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp
@@ -0,0 +1,113 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::asimd_VMOV_imm(Imm<1> a, bool D, Imm<1> b, Imm<1> c, Imm<1> d, size_t Vd, Imm<4> cmode, bool Q, bool op, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h) {
+ if (Q && mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d_reg = ToVector(Q, Vd, D);
+ const auto imm = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h));
+
+ // VMOV
+ const auto mov = [&] {
+ const auto imm64 = ir.Imm64(imm);
+ if (Q) {
+ ir.SetVector(d_reg, ir.VectorBroadcast(64, imm64));
+ } else {
+ ir.SetExtendedRegister(d_reg, imm64);
+ }
+ return true;
+ };
+
+ // VMVN
+ // mvn is a predefined macro in arm64 MSVC
+ const auto mvn_ = [&] {
+ const auto imm64 = ir.Imm64(~imm);
+ if (Q) {
+ ir.SetVector(d_reg, ir.VectorBroadcast(64, imm64));
+ } else {
+ ir.SetExtendedRegister(d_reg, imm64);
+ }
+ return true;
+ };
+
+ // VORR
+ const auto orr = [&] {
+ const auto imm64 = ir.Imm64(imm);
+ if (Q) {
+ const auto reg_value = ir.GetVector(d_reg);
+ ir.SetVector(d_reg, ir.VectorOr(reg_value, ir.VectorBroadcast(64, imm64)));
+ } else {
+ const auto reg_value = ir.GetExtendedRegister(d_reg);
+ ir.SetExtendedRegister(d_reg, ir.Or(reg_value, imm64));
+ }
+ return true;
+ };
+
+ // VBIC
+ const auto bic = [&] {
+ const auto imm64 = ir.Imm64(~imm);
+ if (Q) {
+ const auto reg_value = ir.GetVector(d_reg);
+ ir.SetVector(d_reg, ir.VectorAnd(reg_value, ir.VectorBroadcast(64, imm64)));
+ } else {
+ const auto reg_value = ir.GetExtendedRegister(d_reg);
+ ir.SetExtendedRegister(d_reg, ir.And(reg_value, imm64));
+ }
+ return true;
+ };
+
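+ // Dispatch on the concatenated cmode:op value, which selects between the VMOV, VMVN, VORR
+ // and VBIC forms of the expanded immediate; cmode:op == 0b11111 is UNDEFINED.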
+ switch (concatenate(cmode, Imm<1>{op}).ZeroExtend()) {
+ case 0b00000:
+ case 0b00100:
+ case 0b01000:
+ case 0b01100:
+ case 0b10000:
+ case 0b10100:
+ case 0b11000:
+ case 0b11010:
+ case 0b11100:
+ case 0b11101:
+ case 0b11110:
+ return mov();
+ case 0b11111:
+ return UndefinedInstruction();
+ case 0b00001:
+ case 0b00101:
+ case 0b01001:
+ case 0b01101:
+ case 0b10001:
+ case 0b10101:
+ case 0b11001:
+ case 0b11011:
+ return mvn_();
+ case 0b00010:
+ case 0b00110:
+ case 0b01010:
+ case 0b01110:
+ case 0b10010:
+ case 0b10110:
+ return orr();
+ case 0b00011:
+ case 0b00111:
+ case 0b01011:
+ case 0b01111:
+ case 0b10011:
+ case 0b10111:
+ return bic();
+ }
+
+ UNREACHABLE();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp
new file mode 100644
index 0000000000..a69f39bfb6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp
@@ -0,0 +1,973 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+enum class Comparison {
+ GE,
+ GT,
+ EQ,
+ AbsoluteGE,
+ AbsoluteGT,
+};
+
+enum class AccumulateBehavior {
+ None,
+ Accumulate,
+};
+
+enum class WidenBehaviour {
+ Second,
+ Both,
+};
+
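+// Helper for three-register bitwise operations. WithDst marks instructions whose destination
+// vector is also read as a source operand (bitwise-select style ops); otherwise only Vn and
+// Vm are passed to the callable.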
+template<bool WithDst, typename Callable>
+bool BitwiseInstruction(TranslatorVisitor& v, bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Callable fn) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ if constexpr (WithDst) {
+ const IR::U128 reg_d = v.ir.GetVector(d);
+ const IR::U128 reg_m = v.ir.GetVector(m);
+ const IR::U128 reg_n = v.ir.GetVector(n);
+ const IR::U128 result = fn(reg_d, reg_n, reg_m);
+ v.ir.SetVector(d, result);
+ } else {
+ const IR::U128 reg_m = v.ir.GetVector(m);
+ const IR::U128 reg_n = v.ir.GetVector(n);
+ const IR::U128 result = fn(reg_n, reg_m);
+ v.ir.SetVector(d, result);
+ }
+
+ return true;
+}
+
+template<typename Callable>
+bool FloatingPointInstruction(TranslatorVisitor& v, bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Callable fn) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ if (sz == 0b1) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_d = v.ir.GetVector(d);
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = fn(reg_d, reg_n, reg_m);
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool IntegerComparison(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Comparison comparison) {
+ if (sz == 0b11) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = [&] {
+ switch (comparison) {
+ case Comparison::GT:
+ return U ? v.ir.VectorGreaterUnsigned(esize, reg_n, reg_m)
+ : v.ir.VectorGreaterSigned(esize, reg_n, reg_m);
+ case Comparison::GE:
+ return U ? v.ir.VectorGreaterEqualUnsigned(esize, reg_n, reg_m)
+ : v.ir.VectorGreaterEqualSigned(esize, reg_n, reg_m);
+ case Comparison::EQ:
+ return v.ir.VectorEqual(esize, reg_n, reg_m);
+ default:
+ return IR::U128{};
+ }
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool FloatComparison(TranslatorVisitor& v, bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Comparison comparison) {
+ if (sz) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = [&] {
+ switch (comparison) {
+ case Comparison::GE:
+ return v.ir.FPVectorGreaterEqual(32, reg_n, reg_m, false);
+ case Comparison::GT:
+ return v.ir.FPVectorGreater(32, reg_n, reg_m, false);
+ case Comparison::EQ:
+ return v.ir.FPVectorEqual(32, reg_n, reg_m, false);
+ case Comparison::AbsoluteGE:
+ return v.ir.FPVectorGreaterEqual(32, v.ir.FPVectorAbs(32, reg_n), v.ir.FPVectorAbs(32, reg_m), false);
+ case Comparison::AbsoluteGT:
+ return v.ir.FPVectorGreater(32, v.ir.FPVectorAbs(32, reg_n), v.ir.FPVectorAbs(32, reg_m), false);
+ default:
+ return IR::U128{};
+ }
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool AbsoluteDifference(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, AccumulateBehavior accumulate) {
+ if (sz == 0b11) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = v.ir.GetVector(m);
+ const auto reg_n = v.ir.GetVector(n);
+ const auto result = [&] {
+ const auto absdiff = U ? v.ir.VectorUnsignedAbsoluteDifference(esize, reg_n, reg_m)
+ : v.ir.VectorSignedAbsoluteDifference(esize, reg_n, reg_m);
+
+ if (accumulate == AccumulateBehavior::Accumulate) {
+ const auto reg_d = v.ir.GetVector(d);
+ return v.ir.VectorAdd(esize, reg_d, absdiff);
+ }
+
+ return absdiff;
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, AccumulateBehavior accumulate) {
+ if (sz == 0b11) {
+ return v.DecodeError();
+ }
+
+ if (mcl::bit::get_bit<0>(Vd)) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_m = v.ir.GetVector(m);
+ const auto reg_n = v.ir.GetVector(n);
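+    // Zero-extend the operands into 2*esize lanes: each esize-granularity absolute
+    // difference fits in esize bits, so it lands zero-extended within its widened lane.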
+ const auto operand_m = v.ir.VectorZeroExtend(esize, v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(64, reg_m, 0)));
+ const auto operand_n = v.ir.VectorZeroExtend(esize, v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(64, reg_n, 0)));
+ const auto result = [&] {
+ const auto absdiff = U ? v.ir.VectorUnsignedAbsoluteDifference(esize, operand_m, operand_n)
+ : v.ir.VectorSignedAbsoluteDifference(esize, operand_m, operand_n);
+
+ if (accumulate == AccumulateBehavior::Accumulate) {
+ const auto reg_d = v.ir.GetVector(d);
+ return v.ir.VectorAdd(2 * esize, reg_d, absdiff);
+ }
+
+ return absdiff;
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+template<typename Callable>
+bool WideInstruction(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, WidenBehaviour widen_behaviour, Callable fn) {
+ const size_t esize = 8U << sz;
+ const bool widen_first = widen_behaviour == WidenBehaviour::Both;
+
+ if (sz == 0b11) {
+ return v.DecodeError();
+ }
+
+ if (mcl::bit::get_bit<0>(Vd) || (!widen_first && mcl::bit::get_bit<0>(Vn))) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(!widen_first, Vn, N);
+
+ const auto reg_d = v.ir.GetVector(d);
+ const auto reg_m = v.ir.GetVector(m);
+ const auto reg_n = v.ir.GetVector(n);
+ const auto wide_n = U ? v.ir.VectorZeroExtend(esize, reg_n) : v.ir.VectorSignExtend(esize, reg_n);
+ const auto wide_m = U ? v.ir.VectorZeroExtend(esize, reg_m) : v.ir.VectorSignExtend(esize, reg_m);
+ const auto result = fn(esize * 2, reg_d, widen_first ? wide_n : reg_n, wide_m);
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+// ASIMD Three registers of the same length
+
+bool TranslatorVisitor::asimd_VHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const IR::U128 reg_n = ir.GetVector(n);
+ const IR::U128 reg_m = ir.GetVector(m);
+ const IR::U128 result = U ? ir.VectorHalvingAddUnsigned(esize, reg_n, reg_m) : ir.VectorHalvingAddSigned(esize, reg_n, reg_m);
+ ir.SetVector(d, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const IR::U128 reg_n = ir.GetVector(n);
+ const IR::U128 reg_m = ir.GetVector(m);
+ const IR::U128 result = U ? ir.VectorUnsignedSaturatedAdd(esize, reg_n, reg_m) : ir.VectorSignedSaturatedAdd(esize, reg_n, reg_m);
+ ir.SetVector(d, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VRHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const IR::U128 reg_n = ir.GetVector(n);
+ const IR::U128 reg_m = ir.GetVector(m);
+ const IR::U128 result = U ? ir.VectorRoundingHalvingAddUnsigned(esize, reg_n, reg_m) : ir.VectorRoundingHalvingAddSigned(esize, reg_n, reg_m);
+ ir.SetVector(d, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VAND_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
+ return ir.VectorAnd(reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VBIC_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
+ return ir.VectorAndNot(reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VORR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
+ return ir.VectorOr(reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VORN_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
+ return ir.VectorOr(reg_n, ir.VectorNot(reg_m));
+ });
+}
+
+bool TranslatorVisitor::asimd_VEOR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
+ return ir.VectorEor(reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VBSL(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
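+    // Bitwise select: bits of d choose between n (where d is 1) and m (where d is 0).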
+ return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ return ir.VectorOr(ir.VectorAnd(reg_n, reg_d), ir.VectorAndNot(reg_m, reg_d));
+ });
+}
+
+bool TranslatorVisitor::asimd_VBIT(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
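+    // Bitwise insert if true: take bits from n where m is 1, keep d where m is 0.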
+ return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ return ir.VectorOr(ir.VectorAnd(reg_n, reg_m), ir.VectorAndNot(reg_d, reg_m));
+ });
+}
+
+bool TranslatorVisitor::asimd_VBIF(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
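+    // Bitwise insert if false: take bits from n where m is 0, keep d where m is 1.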
+ return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ return ir.VectorOr(ir.VectorAnd(reg_d, reg_m), ir.VectorAndNot(reg_n, reg_m));
+ });
+}
+
+bool TranslatorVisitor::asimd_VHSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const IR::U128 reg_n = ir.GetVector(n);
+ const IR::U128 reg_m = ir.GetVector(m);
+ const IR::U128 result = U ? ir.VectorHalvingSubUnsigned(esize, reg_n, reg_m) : ir.VectorHalvingSubSigned(esize, reg_n, reg_m);
+ ir.SetVector(d, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const IR::U128 reg_n = ir.GetVector(n);
+ const IR::U128 reg_m = ir.GetVector(m);
+ const IR::U128 result = U ? ir.VectorUnsignedSaturatedSub(esize, reg_n, reg_m) : ir.VectorSignedSaturatedSub(esize, reg_n, reg_m);
+ ir.SetVector(d, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCGT_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return IntegerComparison(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GT);
+}
+
+bool TranslatorVisitor::asimd_VCGE_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return IntegerComparison(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GE);
+}
+
+bool TranslatorVisitor::asimd_VABD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return AbsoluteDifference(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, AccumulateBehavior::None);
+}
+
+bool TranslatorVisitor::asimd_VABA(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return AbsoluteDifference(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, AccumulateBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::asimd_VADD_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = ir.VectorAdd(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSUB_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = ir.VectorSub(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = U ? ir.VectorLogicalVShift(esize, reg_m, reg_n)
+ : ir.VectorArithmeticVShift(esize, reg_m, reg_n);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = U ? ir.VectorUnsignedSaturatedShiftLeft(esize, reg_m, reg_n)
+ : ir.VectorSignedSaturatedShiftLeft(esize, reg_m, reg_n);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = U ? ir.VectorRoundingShiftLeftUnsigned(esize, reg_m, reg_n)
+ : ir.VectorRoundingShiftLeftSigned(esize, reg_m, reg_n);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VMAX(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto result = [&] {
+ if (op) {
+ return U ? ir.VectorMinUnsigned(esize, reg_n, reg_m)
+ : ir.VectorMinSigned(esize, reg_n, reg_m);
+ } else {
+ return U ? ir.VectorMaxUnsigned(esize, reg_n, reg_m)
+ : ir.VectorMaxSigned(esize, reg_n, reg_m);
+ }
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8 << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
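+    // Each lane becomes all-ones if (n AND m) has any bit set in that lane, all-zeros otherwise.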
+ const auto anded = ir.VectorAnd(reg_n, reg_m);
+ const auto result = ir.VectorNot(ir.VectorEqual(esize, anded, ir.ZeroVector()));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCEQ_reg(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return IntegerComparison(*this, false, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::EQ);
+}
+
+bool TranslatorVisitor::asimd_VMLA(bool op, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_d = ir.GetVector(d);
+ const auto multiply = ir.VectorMultiply(esize, reg_n, reg_m);
+ const auto result = op ? ir.VectorSub(esize, reg_d, multiply)
+ : ir.VectorAdd(esize, reg_d, multiply);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11 || (P && sz != 0b00)) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = P ? ir.VectorPolynomialMultiply(reg_n, reg_m)
+ : ir.VectorMultiply(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VPMAX_int(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm) {
+ if (sz == 0b11 || Q) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+
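+    // Pairwise operation: the even/odd elements of the n:m concatenation form the operand pairs.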
+ const auto bottom = ir.VectorDeinterleaveEvenLower(esize, reg_n, reg_m);
+ const auto top = ir.VectorDeinterleaveOddLower(esize, reg_n, reg_m);
+
+ const auto result = [&] {
+ if (op) {
+ return U ? ir.VectorMinUnsigned(esize, bottom, top)
+ : ir.VectorMinSigned(esize, bottom, top);
+ } else {
+ return U ? ir.VectorMaxUnsigned(esize, bottom, top)
+ : ir.VectorMaxSigned(esize, bottom, top);
+ }
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMaxNumeric(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMinNumeric(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VQDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b00 || sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQRDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b00 || sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VPADD(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q || sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto n = ToVector(Q, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorPairedAddLower(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VFMA(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMulAdd(32, reg_d, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VFMS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMulAdd(32, reg_d, ir.FPVectorNeg(32, reg_n), reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorAdd(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VSUB_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorSub(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VPADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q) {
+ return UndefinedInstruction();
+ }
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorPairedAddLower(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VABD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorAbs(32, ir.FPVectorSub(32, reg_n, reg_m, false));
+ });
+}
+
+bool TranslatorVisitor::asimd_VMLA_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ const auto product = ir.FPVectorMul(32, reg_n, reg_m, false);
+ return ir.FPVectorAdd(32, reg_d, product, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VMLS_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
+ const auto product = ir.FPVectorMul(32, reg_n, reg_m, false);
+ return ir.FPVectorAdd(32, reg_d, ir.FPVectorNeg(32, product), false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMul(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VCEQ_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::EQ);
+}
+
+bool TranslatorVisitor::asimd_VCGE_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GE);
+}
+
+bool TranslatorVisitor::asimd_VCGT_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GT);
+}
+
+bool TranslatorVisitor::asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ const auto comparison = op ? Comparison::AbsoluteGT : Comparison::AbsoluteGE;
+ return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, comparison);
+}
+
+bool TranslatorVisitor::asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMax(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorMin(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VPMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q) {
+ return UndefinedInstruction();
+ }
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ const auto bottom = ir.VectorDeinterleaveEvenLower(32, reg_n, reg_m);
+ const auto top = ir.VectorDeinterleaveOddLower(32, reg_n, reg_m);
+ return ir.FPVectorMax(32, bottom, top, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (Q) {
+ return UndefinedInstruction();
+ }
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ const auto bottom = ir.VectorDeinterleaveEvenLower(32, reg_n, reg_m);
+ const auto top = ir.VectorDeinterleaveOddLower(32, reg_n, reg_m);
+ return ir.FPVectorMin(32, bottom, top, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorRecipStepFused(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.FPVectorRSqrtStepFused(32, reg_n, reg_m, false);
+ });
+}
+
+bool TranslatorVisitor::v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto n = ToVector(Q, Vn, N);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto x = ir.GetVector(d);
+ const auto y = ir.GetVector(n);
+ const auto w = ir.GetVector(m);
+ const auto result = ir.SHA256Hash(x, y, w, true);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto n = ToVector(Q, Vn, N);
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto x = ir.GetVector(n);
+ const auto y = ir.GetVector(d);
+ const auto w = ir.GetVector(m);
+ const auto result = ir.SHA256Hash(x, y, w, false);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+ if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto n = ToVector(Q, Vn, N);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto x = ir.GetVector(d);
+ const auto y = ir.GetVector(n);
+ const auto z = ir.GetVector(m);
+ const auto result = ir.SHA256MessageSchedule1(x, y, z);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+// ASIMD Three registers of different lengths
+
+bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
+ return WideInstruction(*this, U, D, sz, Vn, Vd, N, M, Vm, op ? WidenBehaviour::Second : WidenBehaviour::Both, [this](size_t esize, const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.VectorAdd(esize, reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VSUBL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
+ return WideInstruction(*this, U, D, sz, Vn, Vd, N, M, Vm, op ? WidenBehaviour::Second : WidenBehaviour::Both, [this](size_t esize, const auto&, const auto& reg_n, const auto& reg_m) {
+ return ir.VectorSub(esize, reg_n, reg_m);
+ });
+}
+
+bool TranslatorVisitor::asimd_VABAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ return AbsoluteDifferenceLong(*this, U, D, sz, Vn, Vd, N, M, Vm, AccumulateBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::asimd_VABDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ return AbsoluteDifferenceLong(*this, U, D, sz, Vn, Vd, N, M, Vm, AccumulateBehavior::None);
+}
+
+bool TranslatorVisitor::asimd_VMLAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
+ const size_t esize = 8U << sz;
+
+ if (sz == 0b11) {
+ return DecodeError();
+ }
+
+ if (mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_n = ir.GetVector(n);
+ const auto multiply = U ? ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+ : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
+ const auto result = op ? ir.VectorSub(esize * 2, reg_d, multiply)
+ : ir.VectorAdd(esize * 2, reg_d, multiply);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool P, bool N, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return DecodeError();
+ }
+
+    if ((P && (U || sz == 0b10)) || mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = P ? (sz == 0b00 ? 8 : 64) : 8U << sz;
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = P ? ir.VectorPolynomialMultiplyLong(esize, reg_n, reg_m)
+ : U ? ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+ : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
new file mode 100644
index 0000000000..62b9af55a5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
@@ -0,0 +1,767 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+enum class Comparison {
+ EQ,
+ GE,
+ GT,
+ LE,
+ LT,
+};
+
+bool CompareWithZero(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm, Comparison type) {
+ if (sz == 0b11 || (F && sz != 0b10)) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [&] {
+ const auto reg_m = v.ir.GetVector(m);
+ const auto zero = v.ir.ZeroVector();
+
+ if (F) {
+ switch (type) {
+ case Comparison::EQ:
+ return v.ir.FPVectorEqual(32, reg_m, zero, false);
+ case Comparison::GE:
+ return v.ir.FPVectorGreaterEqual(32, reg_m, zero, false);
+ case Comparison::GT:
+ return v.ir.FPVectorGreater(32, reg_m, zero, false);
+ case Comparison::LE:
+ return v.ir.FPVectorGreaterEqual(32, zero, reg_m, false);
+ case Comparison::LT:
+ return v.ir.FPVectorGreater(32, zero, reg_m, false);
+ }
+
+ return IR::U128{};
+ } else {
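+            // Table of IR emitters indexed by Comparison; the order must match the enum.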
+ static constexpr std::array fns{
+ &IREmitter::VectorEqual,
+ &IREmitter::VectorGreaterEqualSigned,
+ &IREmitter::VectorGreaterSigned,
+ &IREmitter::VectorLessEqualSigned,
+ &IREmitter::VectorLessSigned,
+ };
+
+ const size_t esize = 8U << sz;
+ return (v.ir.*fns[static_cast<size_t>(type)])(esize, reg_m, zero);
+ }
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+enum class AccumulateBehavior {
+ None,
+ Accumulate,
+};
+
+bool PairedAddOperation(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm, AccumulateBehavior accumulate) {
+ if (sz == 0b11) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = [&] {
+ const auto tmp = op ? v.ir.VectorPairedAddUnsignedWiden(esize, reg_m)
+ : v.ir.VectorPairedAddSignedWiden(esize, reg_m);
+
+ if (accumulate == AccumulateBehavior::Accumulate) {
+ const auto reg_d = v.ir.GetVector(d);
+ return v.ir.VectorAdd(esize * 2, reg_d, tmp);
+ }
+
+ return tmp;
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool RoundFloatToInteger(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm, bool exact, FP::RoundingMode rounding_mode) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ if (sz != 0b10) {
+ return v.UndefinedInstruction(); // TODO: FP16
+ }
+
+ const size_t esize = 8 << sz;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = v.ir.FPVectorRoundInt(esize, reg_m, rounding_mode, exact, false);
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool ConvertFloatToInteger(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm, FP::RoundingMode rounding_mode) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ if (sz != 0b10) {
+ return v.UndefinedInstruction(); // TODO: FP16
+ }
+
+ const bool unsigned_ = op;
+ const size_t esize = 8 << sz;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = v.ir.GetVector(m);
+ const auto result = unsigned_
+ ? v.ir.FPVectorToUnsignedFixed(esize, reg_m, 0, rounding_mode, false)
+ : v.ir.FPVectorToSignedFixed(esize, reg_m, 0, rounding_mode, false);
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+bool TranslatorVisitor::asimd_VREV(bool D, size_t sz, size_t Vd, size_t op, bool Q, bool M, size_t Vm) {
+ if (op + sz >= 3) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [this, m, op, sz] {
+ const auto reg_m = ir.GetVector(m);
+ const size_t esize = 8 << sz;
+
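+        // op selects the group size that is reversed: 64-bit (VREV64), 32-bit (VREV32) or 16-bit (VREV16) groups.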
+ switch (op) {
+ case 0b00:
+ return ir.VectorReverseElementsInLongGroups(esize, reg_m);
+ case 0b01:
+ return ir.VectorReverseElementsInWordGroups(esize, reg_m);
+ case 0b10:
+ return ir.VectorReverseElementsInHalfGroups(esize, reg_m);
+ }
+
+ UNREACHABLE();
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VPADDL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return PairedAddOperation(*this, D, sz, Vd, op, Q, M, Vm, AccumulateBehavior::None);
+}
+
+bool TranslatorVisitor::v8_AESD(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.AESDecryptSingleRound(ir.VectorEor(reg_d, reg_m));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.AESEncryptSingleRound(ir.VectorEor(reg_d, reg_m));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.AESInverseMixColumns(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.AESMixColumns(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz != 0b10 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+ const auto x = ir.GetVector(d);
+ const auto y = ir.GetVector(m);
+ const auto result = ir.SHA256MessageSchedule0(x, y);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [this, m, sz] {
+ const auto reg_m = ir.GetVector(m);
+ const size_t esize = 8U << sz;
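+        // CLS(x) = CLZ(x EOR sign_mask(x)) - 1; the arithmetic shift replicates each lane's sign bit.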
+ const auto shifted = ir.VectorArithmeticShiftRight(esize, reg_m, static_cast<u8>(esize));
+ const auto xored = ir.VectorEor(reg_m, shifted);
+ const auto clz = ir.VectorCountLeadingZeros(esize, xored);
+ return ir.VectorSub(esize, clz, ir.VectorBroadcast(esize, I(esize, 1)));
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [this, m, sz] {
+ const auto reg_m = ir.GetVector(m);
+ const size_t esize = 8U << sz;
+
+ return ir.VectorCountLeadingZeros(esize, reg_m);
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz != 0b00) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorPopulationCount(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VMVN_reg(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz != 0b00) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorNot(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VPADAL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return PairedAddOperation(*this, D, sz, Vd, op, Q, M, Vm, AccumulateBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::asimd_VQABS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorSignedSaturatedAbs(esize, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQNEG(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorSignedSaturatedNeg(esize, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCGT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::GT);
+}
+
+bool TranslatorVisitor::asimd_VCGE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::GE);
+}
+
+bool TranslatorVisitor::asimd_VCEQ_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::EQ);
+}
+
+bool TranslatorVisitor::asimd_VCLE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::LE);
+}
+
+bool TranslatorVisitor::asimd_VCLT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::LT);
+}
+
+bool TranslatorVisitor::asimd_VABS(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11 || (F && sz != 0b10)) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [this, F, m, sz] {
+ const auto reg_m = ir.GetVector(m);
+
+ if (F) {
+ return ir.FPVectorAbs(32, reg_m);
+ }
+
+ const size_t esize = 8U << sz;
+ return ir.VectorAbs(esize, reg_m);
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VNEG(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11 || (F && sz != 0b10)) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [this, F, m, sz] {
+ const auto reg_m = ir.GetVector(m);
+
+ if (F) {
+ return ir.FPVectorNeg(32, reg_m);
+ }
+
+ const size_t esize = 8U << sz;
+ return ir.VectorSub(esize, ir.ZeroVector(), reg_m);
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSWP(bool D, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ // Swapping the same register results in the same contents.
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ if (d == m) {
+ return true;
+ }
+
+ if (Q) {
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+
+ ir.SetVector(m, reg_d);
+ ir.SetVector(d, reg_m);
+ } else {
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto reg_m = ir.GetExtendedRegister(m);
+
+ ir.SetExtendedRegister(m, reg_d);
+ ir.SetExtendedRegister(d, reg_m);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VTRN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ if (d == m) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+ const auto result_d = ir.VectorTranspose(esize, reg_d, reg_m, false);
+ const auto result_m = ir.VectorTranspose(esize, reg_d, reg_m, true);
+
+ ir.SetVector(d, result_d);
+ ir.SetVector(m, result_m);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VUZP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11 || (!Q && sz == 0b10)) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ if (d == m) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+ auto result_d = Q ? ir.VectorDeinterleaveEven(esize, reg_d, reg_m) : ir.VectorDeinterleaveEvenLower(esize, reg_d, reg_m);
+ auto result_m = Q ? ir.VectorDeinterleaveOdd(esize, reg_d, reg_m) : ir.VectorDeinterleaveOddLower(esize, reg_d, reg_m);
+
+ ir.SetVector(d, result_d);
+ ir.SetVector(m, result_m);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VZIP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ if (sz == 0b11 || (!Q && sz == 0b10)) {
+ return UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ if (d == m) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_d = ir.GetVector(d);
+ const auto reg_m = ir.GetVector(m);
+
+ if (Q) {
+ const auto result_d = ir.VectorInterleaveLower(esize, reg_d, reg_m);
+ const auto result_m = ir.VectorInterleaveUpper(esize, reg_d, reg_m);
+
+ ir.SetVector(d, result_d);
+ ir.SetVector(m, result_m);
+ } else {
+ const auto result = ir.VectorInterleaveLower(esize, reg_d, reg_m);
+
+ ir.SetExtendedRegister(d, ir.VectorGetElement(64, result, 0));
+ ir.SetExtendedRegister(m, ir.VectorGetElement(64, result, 1));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VMOVN(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorNarrow(2 * esize, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQMOVUN(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorSignedSaturatedNarrowToUnsigned(2 * esize, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQMOVN(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm) {
+ if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = op ? ir.VectorUnsignedSaturatedNarrow(2 * esize, reg_m)
+ : ir.VectorSignedSaturatedNarrowToSigned(2 * esize, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSHLL_max(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
+ if (sz == 0b11 || mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
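+    // Shifting by the full element width discards the extension bits, so zero-extension
+    // suffices for both the signed and unsigned forms.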
+ const auto result = ir.VectorLogicalShiftLeft(2 * esize, ir.VectorZeroExtend(esize, reg_m), static_cast<u8>(esize));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_VRINTN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::ToNearest_TieEven);
+}
+bool TranslatorVisitor::v8_VRINTX(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, true, FP::RoundingMode::ToNearest_TieEven);
+}
+bool TranslatorVisitor::v8_VRINTA(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+bool TranslatorVisitor::v8_VRINTZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsZero);
+}
+bool TranslatorVisitor::v8_VRINTM(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsMinusInfinity);
+}
+bool TranslatorVisitor::v8_VRINTP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
+ return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsPlusInfinity);
+}
+
+bool TranslatorVisitor::asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool half_to_single, bool M, size_t Vm) {
+ if (sz != 0b01) {
+ return UndefinedInstruction();
+ }
+ if (half_to_single && mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+ if (!half_to_single && mcl::bit::get_bit<0>(Vm)) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven; // StandardFPSCRValue().RMode
+ const auto d = ToVector(half_to_single, Vd, D);
+ const auto m = ToVector(!half_to_single, Vm, M);
+
+ const auto operand = ir.GetVector(m);
+ const IR::U128 result = half_to_single ? ir.FPVectorFromHalf(esize * 2, operand, rounding_mode, false)
+ : ir.FPVectorToHalf(esize * 2, operand, rounding_mode, false);
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::v8_VCVTA(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+bool TranslatorVisitor::v8_VCVTN(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::ToNearest_TieEven);
+}
+bool TranslatorVisitor::v8_VCVTP(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::TowardsPlusInfinity);
+}
+bool TranslatorVisitor::v8_VCVTM(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) {
+ return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::TowardsMinusInfinity);
+}
+
+bool TranslatorVisitor::asimd_VRECPE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b00 || sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (!F && sz == 0b01) {
+ // TODO: Implement 16-bit VectorUnsignedRecipEstimate
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = F ? ir.FPVectorRecipEstimate(esize, reg_m, false)
+ : ir.VectorUnsignedRecipEstimate(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VRSQRTE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz == 0b00 || sz == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ if (!F && sz == 0b01) {
+        // TODO: Implement 16-bit VectorUnsignedRecipSqrtEstimate
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = F ? ir.FPVectorRSqrtEstimate(esize, reg_m, false)
+ : ir.VectorUnsignedRecipSqrtEstimate(reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCVT_integer(bool D, size_t sz, size_t Vd, bool op, bool U, bool Q, bool M, size_t Vm) {
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (sz != 0b10) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto reg_m = ir.GetVector(m);
+ const auto result = op ? (U ? ir.FPVectorToUnsignedFixed(esize, reg_m, 0, FP::RoundingMode::TowardsZero, false)
+ : ir.FPVectorToSignedFixed(esize, reg_m, 0, FP::RoundingMode::TowardsZero, false))
+ : (U ? ir.FPVectorFromUnsignedFixed(esize, reg_m, 0, FP::RoundingMode::ToNearest_TieEven, false)
+ : ir.FPVectorFromSignedFixed(esize, reg_m, 0, FP::RoundingMode::ToNearest_TieEven, false));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp
new file mode 100644
index 0000000000..8d1876a05d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp
@@ -0,0 +1,188 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <utility>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
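+// The scalar operand lives in a D register, addressed here via its containing Q register:
+// for 16-bit elements the Q index is Vm<2:1> and the element index is Vm<0>:M:Vm<3>;
+// for 32-bit elements the Q index is Vm<3:1> and the element index is Vm<0>:M.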
+std::pair<ExtReg, size_t> GetScalarLocation(size_t esize, bool M, size_t Vm) {
+ const ExtReg m = ExtReg::Q0 + ((Vm >> 1) & (esize == 16 ? 0b11 : 0b111));
+ const size_t index = concatenate(Imm<1>{mcl::bit::get_bit<0>(Vm)}, Imm<1>{M}, Imm<1>{mcl::bit::get_bit<3>(Vm)}).ZeroExtend() >> (esize == 16 ? 0 : 1);
+ return std::make_pair(m, index);
+}
+
+enum class MultiplyBehavior {
+ Multiply,
+ MultiplyAccumulate,
+ MultiplySubtract,
+};
+
+enum class Rounding {
+ None,
+ Round,
+};
+
+bool ScalarMultiply(TranslatorVisitor& v, bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm, MultiplyBehavior multiply) {
+ if (sz == 0b11) {
+ return v.DecodeError();
+ }
+
+ if (sz == 0b00 || (F && sz == 0b01)) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn))) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto n = ToVector(Q, Vn, N);
+ const auto [m, index] = GetScalarLocation(esize, M, Vm);
+
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index);
+ const auto addend = F ? v.ir.FPVectorMul(esize, reg_n, reg_m, false)
+ : v.ir.VectorMultiply(esize, reg_n, reg_m);
+ const auto result = [&] {
+ switch (multiply) {
+ case MultiplyBehavior::Multiply:
+ return addend;
+ case MultiplyBehavior::MultiplyAccumulate:
+ return F ? v.ir.FPVectorAdd(esize, v.ir.GetVector(d), addend, false)
+ : v.ir.VectorAdd(esize, v.ir.GetVector(d), addend);
+ case MultiplyBehavior::MultiplySubtract:
+ return F ? v.ir.FPVectorSub(esize, v.ir.GetVector(d), addend, false)
+ : v.ir.VectorSub(esize, v.ir.GetVector(d), addend);
+ default:
+ return IR::U128{};
+ }
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool ScalarMultiplyLong(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, MultiplyBehavior multiply) {
+ if (sz == 0b11) {
+ return v.DecodeError();
+ }
+
+ if (sz == 0b00 || mcl::bit::get_bit<0>(Vd)) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(true, Vd, D);
+ const auto n = ToVector(false, Vn, N);
+ const auto [m, index] = GetScalarLocation(esize, M, Vm);
+
+ const auto scalar = v.ir.VectorGetElement(esize, v.ir.GetVector(m), index);
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.VectorBroadcast(esize, scalar);
+ const auto addend = U ? v.ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m)
+ : v.ir.VectorMultiplySignedWiden(esize, reg_n, reg_m);
+ const auto result = [&] {
+ switch (multiply) {
+ case MultiplyBehavior::Multiply:
+ return addend;
+ case MultiplyBehavior::MultiplyAccumulate:
+ return v.ir.VectorAdd(esize * 2, v.ir.GetVector(d), addend);
+ case MultiplyBehavior::MultiplySubtract:
+ return v.ir.VectorSub(esize * 2, v.ir.GetVector(d), addend);
+ default:
+ return IR::U128{};
+ }
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool ScalarMultiplyDoublingReturnHigh(TranslatorVisitor& v, bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, Rounding round) {
+ if (sz == 0b11) {
+ return v.DecodeError();
+ }
+
+ if (sz == 0b00) {
+ return v.UndefinedInstruction();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn))) {
+ return v.UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(Q, Vd, D);
+ const auto n = ToVector(Q, Vn, N);
+ const auto [m, index] = GetScalarLocation(esize, M, Vm);
+
+ const auto reg_n = v.ir.GetVector(n);
+ const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index);
+ const auto result = round == Rounding::None
+ ? v.ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, reg_n, reg_m)
+ : v.ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, reg_n, reg_m);
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::asimd_VMLA_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool F, bool N, bool M, size_t Vm) {
+ const auto behavior = op ? MultiplyBehavior::MultiplySubtract
+ : MultiplyBehavior::MultiplyAccumulate;
+ return ScalarMultiply(*this, Q, D, sz, Vn, Vd, F, N, M, Vm, behavior);
+}
+
+bool TranslatorVisitor::asimd_VMLAL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
+ const auto behavior = op ? MultiplyBehavior::MultiplySubtract
+ : MultiplyBehavior::MultiplyAccumulate;
+ return ScalarMultiplyLong(*this, U, D, sz, Vn, Vd, N, M, Vm, behavior);
+}
+
+bool TranslatorVisitor::asimd_VMUL_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm) {
+ return ScalarMultiply(*this, Q, D, sz, Vn, Vd, F, N, M, Vm, MultiplyBehavior::Multiply);
+}
+
+bool TranslatorVisitor::asimd_VMULL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ return ScalarMultiplyLong(*this, U, D, sz, Vn, Vd, N, M, Vm, MultiplyBehavior::Multiply);
+}
+
+bool TranslatorVisitor::asimd_VQDMULL_scalar(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ if (sz == 0b11) {
+ return DecodeError();
+ }
+
+ if (sz == 0b00 || mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ const size_t esize = 8U << sz;
+ const auto d = ToVector(true, Vd, D);
+ const auto n = ToVector(false, Vn, N);
+ const auto [m, index] = GetScalarLocation(esize, M, Vm);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto reg_m = ir.VectorBroadcastElement(esize, ir.GetVector(m), index);
+ const auto result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, reg_n, reg_m);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ return ScalarMultiplyDoublingReturnHigh(*this, Q, D, sz, Vn, Vd, N, M, Vm, Rounding::None);
+}
+
+bool TranslatorVisitor::asimd_VQRDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) {
+ return ScalarMultiplyDoublingReturnHigh(*this, Q, D, sz, Vn, Vd, N, M, Vm, Rounding::Round);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp
new file mode 100644
index 0000000000..b7300f91e4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp
@@ -0,0 +1,349 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+enum class Accumulating {
+ None,
+ Accumulate
+};
+
+enum class Rounding {
+ None,
+ Round,
+};
+
+enum class Narrowing {
+ Truncation,
+ SaturateToUnsigned,
+ SaturateToSigned,
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
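+// Callers pass round_value with only the most significant discarded bit set; VectorEqual then
+// yields all-ones (-1) in each lane where that bit is set in the original, so the subtraction
+// adds 1 to exactly those lanes (round half up).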
+IR::U128 PerformRoundingCorrection(TranslatorVisitor& v, size_t esize, u64 round_value, IR::U128 original, IR::U128 shifted) {
+ const auto round_const = v.ir.VectorBroadcast(esize, v.I(esize, round_value));
+ const auto round_correction = v.ir.VectorEqual(esize, v.ir.VectorAnd(original, round_const), round_const);
+ return v.ir.VectorSub(esize, shifted, round_correction);
+}
+
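+// imm6 encodes both the element size and the shift amount. With L clear, the highest set bit
+// of imm6<5:3> selects esize (8, 16 or 32); right shifts decode as 2*esize - imm6, left shifts
+// as imm6 - esize. With L set, esize is 64; right shifts decode as 64 - imm6, left shifts as imm6.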
+std::pair<size_t, size_t> ElementSizeAndShiftAmount(bool right_shift, bool L, size_t imm6) {
+ if (right_shift) {
+ if (L) {
+ return {64, 64 - imm6};
+ }
+
+ const size_t esize = 8U << mcl::bit::highest_set_bit(imm6 >> 3);
+ const size_t shift_amount = (esize * 2) - imm6;
+ return {esize, shift_amount};
+ } else {
+ if (L) {
+ return {64, imm6};
+ }
+
+ const size_t esize = 8U << mcl::bit::highest_set_bit(imm6 >> 3);
+ const size_t shift_amount = imm6 - esize;
+ return {esize, shift_amount};
+ }
+}
+
+bool ShiftRight(TranslatorVisitor& v, bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm, Accumulating accumulate, Rounding rounding) {
+ if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return v.DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(true, L, imm6);
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = v.ir.GetVector(m);
+ auto result = U ? v.ir.VectorLogicalShiftRight(esize, reg_m, static_cast<u8>(shift_amount))
+ : v.ir.VectorArithmeticShiftRight(esize, reg_m, static_cast<u8>(shift_amount));
+
+ if (rounding == Rounding::Round) {
+ const u64 round_value = 1ULL << (shift_amount - 1);
+ result = PerformRoundingCorrection(v, esize, round_value, reg_m, result);
+ }
+
+ if (accumulate == Accumulating::Accumulate) {
+ const auto reg_d = v.ir.GetVector(d);
+ result = v.ir.VectorAdd(esize, result, reg_d);
+ }
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+
+bool ShiftRightNarrowing(TranslatorVisitor& v, bool D, size_t imm6, size_t Vd, bool M, size_t Vm, Rounding rounding, Narrowing narrowing, Signedness signedness) {
+ if (mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return v.DecodeError();
+ }
+
+ if (mcl::bit::get_bit<0>(Vm)) {
+ return v.UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount_] = ElementSizeAndShiftAmount(true, false, imm6);
+ const auto source_esize = 2 * esize;
+ const auto shift_amount = static_cast<u8>(shift_amount_);
+
+ const auto d = ToVector(false, Vd, D);
+ const auto m = ToVector(true, Vm, M);
+
+ const auto reg_m = v.ir.GetVector(m);
+ auto wide_result = [&] {
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorArithmeticShiftRight(source_esize, reg_m, shift_amount);
+ }
+ return v.ir.VectorLogicalShiftRight(source_esize, reg_m, shift_amount);
+ }();
+
+ if (rounding == Rounding::Round) {
+ const u64 round_value = 1ULL << (shift_amount - 1);
+ wide_result = PerformRoundingCorrection(v, source_esize, round_value, reg_m, wide_result);
+ }
+
+ const auto result = [&] {
+ switch (narrowing) {
+ case Narrowing::Truncation:
+ return v.ir.VectorNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToUnsigned:
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result);
+ }
+ return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToSigned:
+ ASSERT(signedness == Signedness::Signed);
+ return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result);
+ }
+ UNREACHABLE();
+ }();
+
+ v.ir.SetVector(d, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm,
+ Accumulating::None, Rounding::None);
+}
+
+bool TranslatorVisitor::asimd_SRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm,
+ Accumulating::Accumulate, Rounding::None);
+}
+
+bool TranslatorVisitor::asimd_VRSHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm,
+ Accumulating::None, Rounding::Round);
+}
+
+bool TranslatorVisitor::asimd_VRSRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm,
+ Accumulating::Accumulate, Rounding::Round);
+}
+
+bool TranslatorVisitor::asimd_VSRI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(true, L, imm6);
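+    // Only the bits produced by the right shift are inserted into Vd: mask covers the low
+    // esize - shift_amount bits (zero when shift_amount == esize, which also avoids an
+    // out-of-range shift for 64-bit elements).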
+ const u64 mask = shift_amount == esize ? 0 : mcl::bit::ones<u64>(esize) >> shift_amount;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_d = ir.GetVector(d);
+
+ const auto shifted = ir.VectorLogicalShiftRight(esize, reg_m, static_cast<u8>(shift_amount));
+ const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
+ const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSLI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6);
+ const u64 mask = mcl::bit::ones<u64>(esize) << shift_amount;
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto reg_d = ir.GetVector(d);
+
+ const auto shifted = ir.VectorLogicalShiftLeft(esize, reg_m, static_cast<u8>(shift_amount));
+ const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
+ const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VQSHL(bool U, bool D, size_t imm6, size_t Vd, bool op, bool L, bool Q, bool M, size_t Vm) {
+ if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (!U && !op) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+ const auto result = [&] {
+ const auto reg_m = ir.GetVector(m);
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6);
+ const IR::U128 shift_vec = ir.VectorBroadcast(esize, I(esize, shift_amount));
+
+ if (U) {
+ if (op) {
+ return ir.VectorUnsignedSaturatedShiftLeft(esize, reg_m, shift_vec);
+ }
+
+ return ir.VectorSignedSaturatedShiftLeftUnsigned(esize, reg_m, static_cast<u8>(shift_amount));
+ }
+ if (op) {
+ return ir.VectorSignedSaturatedShiftLeft(esize, reg_m, shift_vec);
+ }
+
+ return IR::U128{};
+ }();
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSHL(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) {
+ if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6);
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = ir.VectorLogicalShiftLeft(esize, reg_m, static_cast<u8>(shift_amount));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::None, Narrowing::Truncation, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::asimd_VRSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::Round, Narrowing::Truncation, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::asimd_VQRSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::asimd_VQSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::asimd_VQSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::None, U ? Narrowing::SaturateToUnsigned : Narrowing::SaturateToSigned, U ? Signedness::Unsigned : Signedness::Signed);
+}
+
+bool TranslatorVisitor::asimd_VQRSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm,
+ Rounding::Round, U ? Narrowing::SaturateToUnsigned : Narrowing::SaturateToSigned, U ? Signedness::Unsigned : Signedness::Signed);
+}
+
+bool TranslatorVisitor::asimd_VSHLL(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) {
+ if (mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+
+ const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, false, imm6);
+
+ const auto d = ToVector(true, Vd, D);
+ const auto m = ToVector(false, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto ext_vec = U ? ir.VectorZeroExtend(esize, reg_m) : ir.VectorSignExtend(esize, reg_m);
+ const auto result = ir.VectorLogicalShiftLeft(esize * 2, ext_vec, static_cast<u8>(shift_amount));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::asimd_VCVT_fixed(bool U, bool D, size_t imm6, size_t Vd, bool to_fixed, bool Q, bool M, size_t Vm) {
+ if (mcl::bit::get_bits<3, 5>(imm6) == 0) {
+ return DecodeError();
+ }
+
+ if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) {
+ return UndefinedInstruction();
+ }
+
+ if (!mcl::bit::get_bit<5>(imm6)) {
+ return UndefinedInstruction();
+ }
+
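+    // imm6<5> is set here, so imm6 is in [32, 63] and fbits is in [1, 32].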
+ const size_t fbits = 64 - imm6;
+ const auto d = ToVector(Q, Vd, D);
+ const auto m = ToVector(Q, Vm, M);
+
+ const auto reg_m = ir.GetVector(m);
+ const auto result = to_fixed ? (U ? ir.FPVectorToUnsignedFixed(32, reg_m, fbits, FP::RoundingMode::TowardsZero, false) : ir.FPVectorToSignedFixed(32, reg_m, fbits, FP::RoundingMode::TowardsZero, false))
+ : (U ? ir.FPVectorFromUnsignedFixed(32, reg_m, fbits, FP::RoundingMode::ToNearest_TieEven, false) : ir.FPVectorFromSignedFixed(32, reg_m, fbits, FP::RoundingMode::ToNearest_TieEven, false));
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp
new file mode 100644
index 0000000000..7d9e7459f1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp
@@ -0,0 +1,27 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::arm_DMB(Imm<4> /*option*/) {
+ ir.DataMemoryBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::arm_DSB(Imm<4> /*option*/) {
+ ir.DataSynchronizationBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::arm_ISB(Imm<4> /*option*/) {
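+    // End the block after the barrier so execution continues at PC + 4 via the dispatcher.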
+ ir.InstructionSynchronizationBarrier();
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp
new file mode 100644
index 0000000000..29748a6eba
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp
@@ -0,0 +1,166 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// CDP{2} <coproc_no>, #<opc1>, <CRd>, <CRn>, <CRm>, #<opc2>
+bool TranslatorVisitor::arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ ir.CoprocInternalOperation(coproc_no, two, opc1, CRd, CRn, CRm, opc2);
+ }
+ return true;
+}
+
+// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>, #+/-<imm32>]{!}
+// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], #+/-<imm32>
+// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], <imm8>
+bool TranslatorVisitor::arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ if (!p && !u && !d && !w) {
+ return arm_UDF();
+ }
+
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const bool index = p;
+ const bool add = u;
+ const bool wback = w;
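+        // P=0, U=1, W=0 is the unindexed form: imm8 is handed to the coprocessor as an option
+        // value rather than being used as an offset.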
+ const bool has_option = !p && !w && u;
+ const IR::U32 reg_n = ir.GetRegister(n);
+ const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32));
+ const IR::U32 address = index ? offset_address : reg_n;
+ ir.CoprocLoadWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>());
+ if (wback) {
+ ir.SetRegister(n, offset_address);
+ }
+ }
+ return true;
+}
+
+// MCR{2}<c> <coproc_no>, #<opc1>, <Rt>, <CRn>, <CRm>, #<opc2>
+bool TranslatorVisitor::arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ ir.CoprocSendOneWord(coproc_no, two, opc1, CRn, CRm, opc2, ir.GetRegister(t));
+ }
+ return true;
+}
+
+// MCRR{2}<c> <coproc_no>, #<opc>, <Rt>, <Rt2>, <CRm>
+bool TranslatorVisitor::arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ if (t == Reg::PC || t2 == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ ir.CoprocSendTwoWords(coproc_no, two, opc, CRm, ir.GetRegister(t), ir.GetRegister(t2));
+ }
+ return true;
+}
+
+// MRC{2}<c> <coproc_no>, #<opc1>, <Rt>, <CRn>, <CRm>, #<opc2>
+bool TranslatorVisitor::arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ const auto word = ir.CoprocGetOneWord(coproc_no, two, opc1, CRn, CRm, opc2);
+ if (t != Reg::PC) {
+ ir.SetRegister(t, word);
+ } else {
+ const auto new_cpsr_nzcv = ir.And(word, ir.Imm32(0xF0000000));
+ ir.SetCpsrNZCVRaw(new_cpsr_nzcv);
+ }
+ }
+ return true;
+}
+
+// MRRC{2}<c> <coproc_no>, #<opc>, <Rt>, <Rt2>, <CRm>
+bool TranslatorVisitor::arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ if (t == Reg::PC || t2 == Reg::PC || t == t2) {
+ return UnpredictableInstruction();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ const auto two_words = ir.CoprocGetTwoWords(coproc_no, two, opc, CRm);
+ ir.SetRegister(t, ir.LeastSignificantWord(two_words));
+ ir.SetRegister(t2, ir.MostSignificantWord(two_words).result);
+ }
+ return true;
+}
+
+// STC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>, #+/-<imm32>]{!}
+// STC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], #+/-<imm32>
+// STC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], <imm8>
+bool TranslatorVisitor::arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ if ((coproc_no & 0b1110) == 0b1010) {
+ return arm_UDF();
+ }
+
+ if (!p && !u && !d && !w) {
+ return arm_UDF();
+ }
+
+ if (n == Reg::PC && w) {
+ return UnpredictableInstruction();
+ }
+
+ const bool two = cond == Cond::NV;
+
+ if (two || ArmConditionPassed(cond)) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const bool index = p;
+ const bool add = u;
+ const bool wback = w;
+ const bool has_option = !p && !w && u;
+ const IR::U32 reg_n = ir.GetRegister(n);
+ const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32));
+ const IR::U32 address = index ? offset_address : reg_n;
+ ir.CoprocStoreWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>());
+ if (wback) {
+ ir.SetRegister(n, offset_address);
+ }
+ }
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp
new file mode 100644
index 0000000000..9d7ea6a170
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp
@@ -0,0 +1,1116 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// ADC{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// ADC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// ADC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// ADD{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// ADD{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// ADD{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// AND{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// AND{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// AND{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// BIC{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// BIC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// BIC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// CMN<c> <Rn>, #<const>
+bool TranslatorVisitor::arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMN<c> <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMN<c> <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMP<c> <Rn>, #<const>
+bool TranslatorVisitor::arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
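+    // A carry-in of 1 makes SubWithCarry compute a plain subtraction (the ARM carry flag is NOT borrow).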
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMP<c> <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMP<c> <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_CMP_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// EOR{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// EOR{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// EOR{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// MOV{S}<c> <Rd>, #<const>
+bool TranslatorVisitor::arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.Imm32(imm_carry.imm32);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// MOV{S}<c> <Rd>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = shifted.result;
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
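+// MOV{S}<c> <Rd>, <Rm>, <type> <Rs>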
+bool TranslatorVisitor::arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = shifted.result;
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// MVN{S}<c> <Rd>, #<const>
+bool TranslatorVisitor::arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.Not(ir.Imm32(imm_carry.imm32));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// MVN{S}<c> <Rd>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.Not(shifted.result);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// MVN{S}<c> <Rd>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (d == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.Not(shifted.result);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// ORR{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+
+ return true;
+}
+
+// ORR{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.Or(ir.GetRegister(n), shifted.result);
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// ORR{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.Or(ir.GetRegister(n), shifted.result);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+
+ return true;
+}
+
+// RSB{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.Imm1(1));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// RSB{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// RSB{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// RSC{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
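+// RSC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}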
+bool TranslatorVisitor::arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// RSC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.GetCFlag());
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// SBC{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// SBC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// SBC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// SUB{S}<c> <Rd>, <Rn>, #<const>
+bool TranslatorVisitor::arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// SUB{S}<c> <Rd>, <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+ if (d == Reg::PC) {
+ if (S) {
+ // This is UNPREDICTABLE when in user-mode.
+ return UnpredictableInstruction();
+ }
+
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+ }
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
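+// SUB{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs>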
+bool TranslatorVisitor::arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+
+ return true;
+}
+
+// TEQ<c> <Rn>, #<const>
+bool TranslatorVisitor::arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ return true;
+}
+
+// TEQ<c> <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+// TEQ<c> <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+// TST<c> <Rn>, #<const>
+bool TranslatorVisitor::arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ return true;
+}
+
+// TST<c> <Rn>, <Rm>{, <shift>}
+bool TranslatorVisitor::arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+// TST<c> <Rn>, <Rm>, <type> <Rs>
+bool TranslatorVisitor::arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) {
+ if (n == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
+ const auto carry_in = ir.GetCFlag();
+ const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp
new file mode 100644
index 0000000000..f13a257d21
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp
@@ -0,0 +1,40 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+using DivideFunction = IR::U32U64 (IREmitter::*)(const IR::U32U64&, const IR::U32U64&);
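+// SDIV and UDIV differ only in the IR operation they emit, so both are routed
+// through DivideOperation with a pointer-to-member selecting the divide op.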
+
+bool DivideOperation(TranslatorVisitor& v, Cond cond, Reg d, Reg m, Reg n, DivideFunction fn) {
+ if (d == Reg::PC || m == Reg::PC || n == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ if (!v.ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 operand1 = v.ir.GetRegister(n);
+ const IR::U32 operand2 = v.ir.GetRegister(m);
+ const IR::U32 result = (v.ir.*fn)(operand1, operand2);
+
+ v.ir.SetRegister(d, result);
+ return true;
+}
+} // Anonymous namespace
+
+// SDIV<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SDIV(Cond cond, Reg d, Reg m, Reg n) {
+ return DivideOperation(*this, cond, d, m, n, &IREmitter::SignedDiv);
+}
+
+// UDIV<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UDIV(Cond cond, Reg d, Reg m, Reg n) {
+ return DivideOperation(*this, cond, d, m, n, &IREmitter::UnsignedDiv);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp
new file mode 100644
index 0000000000..3ee6a670f0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp
@@ -0,0 +1,229 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
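+// <rotation> is encoded as a rotate-right by 0, 8, 16 or 24 bits;
+// SignExtendRotation stores the amount divided by 8.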
+static IR::U32 Rotate(A32::IREmitter& ir, Reg m, SignExtendRotation rotate) {
+ const u8 rotate_by = static_cast<u8>(static_cast<size_t>(rotate) * 8);
+ return ir.RotateRight(ir.GetRegister(m), ir.Imm8(rotate_by), ir.Imm1(0)).result;
+}
+
+// SXTAB<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SXTAB16<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
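+    // NOTE: low_byte keeps bits [7:0] and [23:16]; multiplying the isolated sign
+    // bits (0x00800080) by 0x1FE replicates each sign bit across the upper byte of
+    // its halfword (0x80 * 0x1FE == 0xFF00), so the Or below sign-extends both
+    // packed bytes with plain 32-bit arithmetic.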
+ const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080));
+ const auto addend = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE)));
+ const auto result = ir.PackedAddU16(addend, ir.GetRegister(n)).result;
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SXTAH<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SXTB<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SXTB16<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
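+    // Same packed sign-extension trick as in arm_SXTAB16 above.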
+ const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080));
+ const auto result = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SXTH<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTAB<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTAB16<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ auto result = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto reg_n = ir.GetRegister(n);
+ result = ir.PackedAddU16(reg_n, result).result;
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTAH<c> <Rd>, <Rn>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTB<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTB16<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.And(rotated, ir.Imm32(0x00FF00FF));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UXTH<c> <Rd>, <Rm>{, <rotation>}
+bool TranslatorVisitor::arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp
new file mode 100644
index 0000000000..0ec6aeff9a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp
@@ -0,0 +1,69 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::arm_PLD_imm(bool /*add*/, bool R, Reg /*n*/, Imm<12> /*imm12*/) {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
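+    // R == 1 selects PLD, R == 0 selects PLDW (preload with intent to write).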
+ const auto exception = R ? Exception::PreloadData : Exception::PreloadDataWithIntentToWrite;
+ return RaiseException(exception);
+}
+
+bool TranslatorVisitor::arm_PLD_reg(bool /*add*/, bool R, Reg /*n*/, Imm<5> /*imm5*/, ShiftType /*shift*/, Reg /*m*/) {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ const auto exception = R ? Exception::PreloadData : Exception::PreloadDataWithIntentToWrite;
+ return RaiseException(exception);
+}
+
+bool TranslatorVisitor::arm_SEV() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ return RaiseException(Exception::SendEvent);
+}
+
+bool TranslatorVisitor::arm_SEVL() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ return RaiseException(Exception::SendEventLocal);
+}
+
+bool TranslatorVisitor::arm_WFE() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ return RaiseException(Exception::WaitForEvent);
+}
+
+bool TranslatorVisitor::arm_WFI() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ return RaiseException(Exception::WaitForInterrupt);
+}
+
+bool TranslatorVisitor::arm_YIELD() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+
+ return RaiseException(Exception::Yield);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp
new file mode 100644
index 0000000000..7ef8b7e890
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp
@@ -0,0 +1,957 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::arm_LDRBT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_LDRHT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_LDRSBT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_LDRSHT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_LDRT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_STRBT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_STRHT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
+bool TranslatorVisitor::arm_STRT() {
+ // System instructions unimplemented
+ return UndefinedInstruction();
+}
+
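+// Computes the effective address for the immediate/register load/store forms:
+// P selects pre-indexed (base +/- offset) versus post-indexed (base) addressing,
+// U selects add versus subtract, and writeback to Rn happens for !P || W.
+// For example, a post-indexed LDR r0, [r1], #4 has P=0, U=1, W=0: the access uses
+// the original r1 and r1 is then advanced by 4.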
+static IR::U32 GetAddress(A32::IREmitter& ir, bool P, bool U, bool W, Reg n, IR::U32 offset) {
+ const bool index = P;
+ const bool add = U;
+ const bool wback = !P || W;
+
+ const IR::U32 offset_addr = add ? ir.Add(ir.GetRegister(n), offset) : ir.Sub(ir.GetRegister(n), offset);
+ const IR::U32 address = index ? offset_addr : ir.GetRegister(n);
+
+ if (wback) {
+ ir.SetRegister(n, offset_addr);
+ }
+
+ return address;
+}
+
+// LDR <Rt>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12) {
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const bool add = U;
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm12.ZeroExtend()) : (base - imm12.ZeroExtend());
+ const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.LoadWritePC(data);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDR <Rt>, [<Rn>, #+/-<imm>]{!}
+// LDR <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if ((!P || W) && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm12.ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.LoadWritePC(data);
+
+ if (!P && W && n == Reg::R13) {
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ } else {
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ }
+
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDR <Rt>, [<Rn>, #+/-<Rm>]{!}
+// LDR <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result;
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.LoadWritePC(data);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRB <Rt>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm12.ZeroExtend();
+ const bool add = U;
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm32) : (base - imm32);
+ const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(ir.Imm32(address), IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRB <Rt>, [<Rn>, #+/-<imm>]{!}
+// LDRB <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if ((!P || W) && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm12.ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRB <Rt>, [<Rn>, #+/-<Rm>]{!}
+// LDRB <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result;
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRD <Rt>, <Rt2>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (RegNumber(t) % 2 == 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (t + 1 == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const Reg t2 = t + 1;
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const bool add = U;
+
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm32) : (base - imm32);
+
+ // NOTE: If alignment is exactly off by 4, each word is an atomic access.
+ const IR::U64 data = ir.ReadMemory64(ir.Imm32(address), IR::AccType::ATOMIC);
+
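+    // The CPSR.E flag selects big-endian data accesses, so the two words are
+    // written to the register pair in swapped order.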
+ if (ir.current_location.EFlag()) {
+ ir.SetRegister(t, ir.MostSignificantWord(data).result);
+ ir.SetRegister(t2, ir.LeastSignificantWord(data));
+ } else {
+ ir.SetRegister(t, ir.LeastSignificantWord(data));
+ ir.SetRegister(t2, ir.MostSignificantWord(data).result);
+ }
+ return true;
+}
+
+// LDRD <Rt>, <Rt2>, [<Rn>, #+/-<imm>]{!}
+// LDRD <Rt>, <Rt2>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (RegNumber(t) % 2 == 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!P && W) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == t || n == t + 1)) {
+ return UnpredictableInstruction();
+ }
+
+ if (t + 1 == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const Reg t2 = t + 1;
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+
+ // NOTE: If alignment is exactly off by 4, each word is an atomic access.
+ const IR::U64 data = ir.ReadMemory64(address, IR::AccType::ATOMIC);
+
+ if (ir.current_location.EFlag()) {
+ ir.SetRegister(t, ir.MostSignificantWord(data).result);
+ ir.SetRegister(t2, ir.LeastSignificantWord(data));
+ } else {
+ ir.SetRegister(t, ir.LeastSignificantWord(data));
+ ir.SetRegister(t2, ir.MostSignificantWord(data).result);
+ }
+ return true;
+}
+
+// LDRD <Rt>, <Rt2>, [<Rn>, #+/-<Rm>]{!}
+// LDRD <Rt>, <Rt2>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ if (RegNumber(t) % 2 == 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!P && W) {
+ return UnpredictableInstruction();
+ }
+
+ if (t + 1 == Reg::PC || m == Reg::PC || m == t || m == t + 1) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t || n == t + 1)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const Reg t2 = t + 1;
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+
+ // NOTE: If alignment is exactly off by 4, each word is an atomic access.
+ const IR::U64 data = ir.ReadMemory64(address, IR::AccType::ATOMIC);
+
+ if (ir.current_location.EFlag()) {
+ ir.SetRegister(t, ir.MostSignificantWord(data).result);
+ ir.SetRegister(t2, ir.LeastSignificantWord(data));
+ } else {
+ ir.SetRegister(t, ir.LeastSignificantWord(data));
+ ir.SetRegister(t2, ir.MostSignificantWord(data).result);
+ }
+ return true;
+}
+
+// LDRH <Rt>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (P == W) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const bool add = U;
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm32) : (base - imm32);
+ const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(ir.Imm32(address), IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRH <Rt>, [<Rn>, #+/-<imm>]{!}
+// LDRH <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if ((!P || W) && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRH <Rt>, [<Rn>, #+/-<Rm>]{!}
+// LDRH <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSB <Rt>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const bool add = U;
+
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm32) : (base - imm32);
+ const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(ir.Imm32(address), IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSB <Rt>, [<Rn>, #+/-<imm>]{!}
+// LDRSB <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if ((!P || W) && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSB <Rt>, [<Rn>, #+/-<Rm>]{!}
+// LDRSB <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSH <Rt>, [PC, #+/-<imm>]
+bool TranslatorVisitor::arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const bool add = U;
+ const u32 base = ir.AlignPC(4);
+ const u32 address = add ? (base + imm32) : (base - imm32);
+ const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(ir.Imm32(address), IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSH <Rt>, [<Rn>, #+/-<imm>]{!}
+// LDRSH <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if ((!P || W) && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSH <Rt>, [<Rn>, #+/-<Rm>]{!}
+// LDRSH <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ ASSERT_MSG(!(!P && W), "T form of instruction unimplemented");
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STR <Rt>, [<Rn>, #+/-<imm>]{!}
+// STR <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.Imm32(imm12.ZeroExtend());
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::NORMAL);
+ return true;
+}
+
+// STR <Rt>, [<Rn>, #+/-<Rm>]{!}
+// STR <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result;
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::NORMAL);
+ return true;
+}
+
+// STRB <Rt>, [<Rn>, #+/-<imm>]{!}
+// STRB <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.Imm32(imm12.ZeroExtend());
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::NORMAL);
+ return true;
+}
+
+// STRB <Rt>, [<Rn>, #+/-<Rm>]{!}
+// STRB <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) {
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result;
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::NORMAL);
+ return true;
+}
+
+// STRD <Rt>, <Rt2>, [<Rn>, #+/-<imm>]{!}
+// STRD <Rt>, <Rt2>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (size_t(t) % 2 != 0) {
+ return UnpredictableInstruction();
+ }
+
+ if (!P && W) {
+ return UnpredictableInstruction();
+ }
+
+ const Reg t2 = t + 1;
+ if ((!P || W) && (n == Reg::PC || n == t || n == t2)) {
+ return UnpredictableInstruction();
+ }
+
+ if (t2 == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto value_a = ir.GetRegister(t);
+ const auto value_b = ir.GetRegister(t2);
+
+ const IR::U64 data = ir.current_location.EFlag() ? ir.Pack2x32To1x64(value_b, value_a)
+ : ir.Pack2x32To1x64(value_a, value_b);
+
+ // NOTE: If alignment is exactly off by 4, each word is an atomic access.
+ ir.WriteMemory64(address, data, IR::AccType::ATOMIC);
+ return true;
+}
+
+// STRD <Rt>, <Rt2>, [<Rn>, #+/-<Rm>]{!}
+// STRD <Rt>, <Rt2>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ if (size_t(t) % 2 != 0) {
+ return UnpredictableInstruction();
+ }
+
+ if (!P && W) {
+ return UnpredictableInstruction();
+ }
+
+ const Reg t2 = t + 1;
+ if (t2 == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t || n == t2)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+ const auto value_a = ir.GetRegister(t);
+ const auto value_b = ir.GetRegister(t2);
+
+ const IR::U64 data = ir.current_location.EFlag() ? ir.Pack2x32To1x64(value_b, value_a)
+ : ir.Pack2x32To1x64(value_a, value_b);
+
+ // NOTE: If alignment is exactly off by 4, each word is an atomic access.
+ ir.WriteMemory64(address, data, IR::AccType::ATOMIC);
+ return true;
+}
+
+// STRH <Rt>, [<Rn>, #+/-<imm>]{!}
+// STRH <Rt>, [<Rn>], #+/-<imm>
+bool TranslatorVisitor::arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend();
+ const auto offset = ir.Imm32(imm32);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+
+ ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::NORMAL);
+ return true;
+}
+
+// STRH <Rt>, [<Rn>, #+/-<Rm>]{!}
+// STRH <Rt>, [<Rn>], #+/-<Rm>
+bool TranslatorVisitor::arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) {
+ if (t == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if ((!P || W) && (n == Reg::PC || n == t)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto offset = ir.GetRegister(m);
+ const auto address = GetAddress(ir, P, U, W, n, offset);
+
+ ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::NORMAL);
+ return true;
+}
+
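+// Loads ascending words starting at start_address into the registers set in list
+// (lowest-numbered first). Rn is updated to writeback_address when W is set and Rn
+// is not itself in the list; a set bit 15 loads the PC and terminates the block.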
+static bool LDMHelper(A32::IREmitter& ir, bool W, Reg n, RegList list, IR::U32 start_address, IR::U32 writeback_address) {
+ auto address = start_address;
+ for (size_t i = 0; i <= 14; i++) {
+ if (mcl::bit::get_bit(i, list)) {
+ ir.SetRegister(static_cast<Reg>(i), ir.ReadMemory32(address, IR::AccType::ATOMIC));
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+ if (W && !mcl::bit::get_bit(RegNumber(n), list)) {
+ ir.SetRegister(n, writeback_address);
+ }
+ if (mcl::bit::get_bit<15>(list)) {
+ ir.LoadWritePC(ir.ReadMemory32(address, IR::AccType::ATOMIC));
+ if (n == Reg::R13)
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ else
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+ return true;
+}
+
+// LDM <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_LDM(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.GetRegister(n);
+ const auto writeback_address = ir.Add(start_address, ir.Imm32(u32(mcl::bit::count_ones(list) * 4)));
+ return LDMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// LDMDA <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_LDMDA(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list) - 4)));
+ const auto writeback_address = ir.Sub(start_address, ir.Imm32(4));
+ return LDMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// LDMDB <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_LDMDB(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list))));
+ const auto writeback_address = start_address;
+ return LDMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// LDMIB <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_LDMIB(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Add(ir.GetRegister(n), ir.Imm32(4));
+ const auto writeback_address = ir.Add(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list))));
+ return LDMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+bool TranslatorVisitor::arm_LDM_usr() {
+ return InterpretThisInstruction();
+}
+
+bool TranslatorVisitor::arm_LDM_eret() {
+ return InterpretThisInstruction();
+}
+
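+// Stores the registers set in list to ascending words starting at start_address,
+// updating Rn to writeback_address when W is set; a set bit 15 stores the PC last.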
+static bool STMHelper(A32::IREmitter& ir, bool W, Reg n, RegList list, IR::U32 start_address, IR::U32 writeback_address) {
+ auto address = start_address;
+ for (size_t i = 0; i <= 14; i++) {
+ if (mcl::bit::get_bit(i, list)) {
+ ir.WriteMemory32(address, ir.GetRegister(static_cast<Reg>(i)), IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+ if (W) {
+ ir.SetRegister(n, writeback_address);
+ }
+ if (mcl::bit::get_bit<15>(list)) {
+ ir.WriteMemory32(address, ir.Imm32(ir.PC()), IR::AccType::ATOMIC);
+ }
+ return true;
+}
+
+// STM <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_STM(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.GetRegister(n);
+ const auto writeback_address = ir.Add(start_address, ir.Imm32(u32(mcl::bit::count_ones(list) * 4)));
+ return STMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// STMDA <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_STMDA(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list) - 4)));
+ const auto writeback_address = ir.Sub(start_address, ir.Imm32(4));
+ return STMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// STMDB <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_STMDB(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list))));
+ const auto writeback_address = start_address;
+ return STMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+// STMIB <Rn>{!}, <reg_list>
+bool TranslatorVisitor::arm_STMIB(Cond cond, bool W, Reg n, RegList list) {
+ if (n == Reg::PC || mcl::bit::count_ones(list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto start_address = ir.Add(ir.GetRegister(n), ir.Imm32(4));
+ const auto writeback_address = ir.Add(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list))));
+ return STMHelper(ir, W, n, list, start_address, writeback_address);
+}
+
+bool TranslatorVisitor::arm_STM_usr() {
+ return InterpretThisInstruction();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp
new file mode 100644
index 0000000000..ef54b66827
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp
@@ -0,0 +1,179 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bitsizeof.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// BFC<c> <Rd>, #<lsb>, #<width>
+bool TranslatorVisitor::arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (msb < lsb) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 msb_value = msb.ZeroExtend();
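+    // Mask with zeros in bits [msb:lsb] so that exactly those bits of Rd are cleared.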
+ const u32 mask = ~(mcl::bit::ones<u32>(msb_value - lsb_value + 1) << lsb_value);
+ const IR::U32 operand = ir.GetRegister(d);
+ const IR::U32 result = ir.And(operand, ir.Imm32(mask));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// BFI<c> <Rd>, <Rn>, #<lsb>, #<width>
+bool TranslatorVisitor::arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (msb < lsb) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 msb_value = msb.ZeroExtend();
+ const u32 inclusion_mask = mcl::bit::ones<u32>(msb_value - lsb_value + 1) << lsb_value;
+ const u32 exclusion_mask = ~inclusion_mask;
+ const IR::U32 operand1 = ir.And(ir.GetRegister(d), ir.Imm32(exclusion_mask));
+ const IR::U32 operand2 = ir.And(ir.LogicalShiftLeft(ir.GetRegister(n), ir.Imm8(u8(lsb_value))), ir.Imm32(inclusion_mask));
+ const IR::U32 result = ir.Or(operand1, operand2);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// CLZ<c> <Rd>, <Rm>
+bool TranslatorVisitor::arm_CLZ(Cond cond, Reg d, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.SetRegister(d, ir.CountLeadingZeros(ir.GetRegister(m)));
+ return true;
+}
+
+// MOVT<c> <Rd>, #<imm16>
+bool TranslatorVisitor::arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 imm16 = ir.Imm32(concatenate(imm4, imm12).ZeroExtend() << 16);
+ const IR::U32 operand = ir.GetRegister(d);
+ const IR::U32 result = ir.Or(ir.And(operand, ir.Imm32(0x0000FFFFU)), imm16);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
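+// MOVW<c> <Rd>, #<imm16>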
+bool TranslatorVisitor::arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 imm = ir.Imm32(concatenate(imm4, imm12).ZeroExtend());
+
+ ir.SetRegister(d, imm);
+ return true;
+}
+
+// SBFX<c> <Rd>, <Rn>, #<lsb>, #<width>
+bool TranslatorVisitor::arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 widthm1_value = widthm1.ZeroExtend();
+ const u32 msb = lsb_value + widthm1_value;
+ if (msb >= mcl::bitsizeof<u32>) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ constexpr size_t max_width = mcl::bitsizeof<u32>;
+ const u32 width = widthm1_value + 1;
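+    // Shift the field up to the top of the word, then arithmetic-shift it back down
+    // so the field is sign-extended into the full 32-bit result.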
+ const u8 left_shift_amount = static_cast<u8>(max_width - width - lsb_value);
+ const u8 right_shift_amount = static_cast<u8>(max_width - width);
+ const IR::U32 operand = ir.GetRegister(n);
+ const IR::U32 tmp = ir.LogicalShiftLeft(operand, ir.Imm8(left_shift_amount));
+ const IR::U32 result = ir.ArithmeticShiftRight(tmp, ir.Imm8(right_shift_amount));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SEL<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SEL(Cond cond, Reg n, Reg d, Reg m) {
+ if (n == Reg::PC || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto to = ir.GetRegister(m);
+ const auto from = ir.GetRegister(n);
+ const auto result = ir.PackedSelect(ir.GetGEFlags(), to, from);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UBFX<c> <Rd>, <Rn>, #<lsb>, #<width>
+bool TranslatorVisitor::arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsb_value = lsb.ZeroExtend();
+ const u32 widthm1_value = widthm1.ZeroExtend();
+ const u32 msb = lsb_value + widthm1_value;
+ if (msb >= mcl::bitsizeof<u32>) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 operand = ir.GetRegister(n);
+ const IR::U32 mask = ir.Imm32(mcl::bit::ones<u32>(widthm1_value + 1));
+ const IR::U32 result = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(u8(lsb_value))), mask);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp
new file mode 100644
index 0000000000..2bda56a961
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp
@@ -0,0 +1,604 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// MLA{S}<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.Add(ir.Mul(ir.GetRegister(n), ir.GetRegister(m)), ir.GetRegister(a));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// MLS<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n) {
+ if (d == Reg::PC || a == Reg::PC || m == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 operand1 = ir.GetRegister(n);
+ const IR::U32 operand2 = ir.GetRegister(m);
+ const IR::U32 operand3 = ir.GetRegister(a);
+ const IR::U32 result = ir.Sub(operand3, ir.Mul(operand1, operand2));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// MUL{S}<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.Mul(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// SMLAL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto product = ir.Mul(n64, m64);
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(product, addend);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// SMULL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Mul(n64, m64);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// UMAAL<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto lo64 = ir.ZeroExtendWordToLong(ir.GetRegister(dLo));
+ const auto hi64 = ir.ZeroExtendWordToLong(ir.GetRegister(dHi));
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Add(ir.Add(ir.Mul(n64, m64), hi64), lo64);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+// UMLAL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Add(ir.Mul(n64, m64), addend);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// UMULL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Mul(n64, m64);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ if (S) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+
+ return true;
+}
+
+// SMLAL<x><y><c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const IR::U64 product = ir.SignExtendWordToLong(ir.Mul(n16, m16));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(product, addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+// SMLA<x><y><c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const IR::U32 product = ir.Mul(n16, m16);
+ const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+// SMUL<x><y><c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const IR::U32 result = ir.Mul(n16, m16);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SMLAW<y><c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ IR::U32 m32 = ir.GetRegister(m);
+ if (M) {
+ m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ }
+ const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)));
+ const auto product = ir.LeastSignificantWord(ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16)));
+ const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+// SMULW<y><c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ IR::U32 m32 = ir.GetRegister(m);
+ if (M) {
+ m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ }
+ const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)));
+ const auto result = ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16));
+
+ ir.SetRegister(d, ir.LeastSignificantWord(result));
+ return true;
+}
+
+// SMMLA{R}<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC /* no check for a */) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a));
+ const auto temp = ir.Add(a64, ir.Mul(n64, m64));
+ const auto result_carry = ir.MostSignificantWord(temp);
+ auto result = result_carry.result;
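+    // The R (round) variant adds bit 31 of the discarded low half, which is
+    // equivalent to adding 0x80000000 before taking the upper word.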
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SMMLS{R}<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a));
+ const auto temp = ir.Sub(a64, ir.Mul(n64, m64));
+ const auto result_carry = ir.MostSignificantWord(temp);
+ auto result = result_carry.result;
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SMMUL{R}<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto product = ir.Mul(n64, m64);
+ const auto result_carry = ir.MostSignificantWord(product);
+ auto result = result_carry.result;
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SMLAD{X}<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ if (a == Reg::PC) {
+ return arm_SMUAD(cond, d, m, M, n);
+ }
+
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const IR::U32 addend = ir.GetRegister(a);
+
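+    // Both the product sum and the accumulate can overflow; either overflow sets
+    // the sticky Q flag.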
+ auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0));
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ result = ir.AddWithCarry(result, addend, ir.Imm1(0));
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+// SMLALD{X}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
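+    // The products are widened to 64 bits and accumulated into RdHi:RdLo, so no
+    // saturation occurs and the Q flag is not affected.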
+ const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo));
+ const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(ir.Add(product_lo, product_hi), addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+// SMLSD{X}<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) {
+ if (a == Reg::PC) {
+ return arm_SMUSD(cond, d, m, M, n);
+ }
+
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const IR::U32 addend = ir.GetRegister(a);
+ const IR::U32 product = ir.Sub(product_lo, product_hi);
+ auto result = ir.AddWithCarry(product, addend, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+// SMLSLD{X}<c> <RdLo>, <RdHi>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dLo == dHi) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo));
+ const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(ir.Sub(product_lo, product_hi), addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+// SMUAD{X}<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+// SMUSD{X}<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ auto result = ir.Sub(product_lo, product_hi);
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp
new file mode 100644
index 0000000000..5cb88fe47d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp
@@ -0,0 +1,46 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// PKHBT<c> <Rd>, <Rn>, <Rm>{, LSL #<imm>}
+bool TranslatorVisitor::arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) {
+ if (n == Reg::PC || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), ShiftType::LSL, imm5, ir.Imm1(false)).result;
+ const auto lower_half = ir.And(ir.GetRegister(n), ir.Imm32(0x0000FFFF));
+ const auto upper_half = ir.And(shifted, ir.Imm32(0xFFFF0000));
+
+ ir.SetRegister(d, ir.Or(lower_half, upper_half));
+ return true;
+}
+
+// PKHTB<c> <Rd>, <Rn>, <Rm>{, ASR #<imm>}
+bool TranslatorVisitor::arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) {
+ if (n == Reg::PC || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), ShiftType::ASR, imm5, ir.Imm1(false)).result;
+ const auto lower_half = ir.And(shifted, ir.Imm32(0x0000FFFF));
+ const auto upper_half = ir.And(ir.GetRegister(n), ir.Imm32(0xFFFF0000));
+
+ ir.SetRegister(d, ir.Or(lower_half, upper_half));
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp
new file mode 100644
index 0000000000..666cee2c33
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp
@@ -0,0 +1,537 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// SADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// SADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// SASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// SSAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SSAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// SSUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// SSUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// UADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// UADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// UASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// USAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// USAD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_USAD8(Cond cond, Reg d, Reg m, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedAbsDiffSumU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// USADA8<c> <Rd>, <Rn>, <Rm>, <Ra>
+bool TranslatorVisitor::arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto tmp = ir.PackedAbsDiffSumU8(ir.GetRegister(n), ir.GetRegister(m));
+ const auto result = ir.AddWithCarry(ir.GetRegister(a), tmp, ir.Imm1(0));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// USUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_USUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// USUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_USUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSubU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+// Parallel Add/Subtract (Saturating) instructions
+
+// QADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedAddS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// QADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedAddS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// QSUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedSubS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// QSUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedSubS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedAddU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedAddU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQSUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedSubU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQSUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedSaturatedSubU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// Parallel Add/Subtract (Halving) instructions
+
+// SHADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SHADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SHASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddSubS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SHSAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SHSUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubS8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SHSUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubS16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHADD8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHADD16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingAddSubU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHSAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHSUB8<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubU8(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UHSUB16<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.PackedHalvingSubU16(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp
new file mode 100644
index 0000000000..4bf594919e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp
@@ -0,0 +1,89 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// RBIT<c> <Rd>, <Rm>
+bool TranslatorVisitor::arm_RBIT(Cond cond, Reg d, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
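+    // Reverse in three steps: reverse the bytes, swap the nibbles within each byte,
+    // then reverse the bit order within each nibble.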
+ const IR::U32 swapped = ir.ByteReverseWord(ir.GetRegister(m));
+
+ // ((x & 0xF0F0F0F0) >> 4) | ((x & 0x0F0F0F0F) << 4)
+ const IR::U32 first_lsr = ir.LogicalShiftRight(ir.And(swapped, ir.Imm32(0xF0F0F0F0)), ir.Imm8(4));
+ const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(swapped, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4));
+ const IR::U32 corrected = ir.Or(first_lsl, first_lsr);
+
+ // ((x & 0x88888888) >> 3) | ((x & 0x44444444) >> 1) |
+ // ((x & 0x22222222) << 1) | ((x & 0x11111111) << 3)
+ const IR::U32 second_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x88888888)), ir.Imm8(3));
+ const IR::U32 third_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x44444444)), ir.Imm8(1));
+ const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x22222222)), ir.Imm8(1));
+ const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x11111111)), ir.Imm8(3));
+
+ const IR::U32 result = ir.Or(ir.Or(ir.Or(second_lsr, third_lsr), second_lsl), third_lsl);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// REV<c> <Rd>, <Rm>
+bool TranslatorVisitor::arm_REV(Cond cond, Reg d, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto result = ir.ByteReverseWord(ir.GetRegister(m));
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// REV16<c> <Rd>, <Rm>
+bool TranslatorVisitor::arm_REV16(Cond cond, Reg d, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto lo = ir.And(ir.LogicalShiftRight(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0x00FF00FF));
+ const auto hi = ir.And(ir.LogicalShiftLeft(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0xFF00FF00));
+ const auto result = ir.Or(lo, hi);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// REVSH<c> <Rd>, <Rm>
+bool TranslatorVisitor::arm_REVSH(Cond cond, Reg d, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(ir.GetRegister(m)));
+ ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half));
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp
new file mode 100644
index 0000000000..41db115044
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp
@@ -0,0 +1,285 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
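+// Packs two halfword values (each held in the low 16 bits of a word) into a single
+// word: lo goes to bits [15:0], hi to bits [31:16].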
+static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) {
+ return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
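+// Returns bits [31:16] of a word as a halfword.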
+static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) {
+ return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
+// Saturation instructions
+
+// SSAT<c> <Rd>, #<imm>, <Rn>{, <shift>}
+bool TranslatorVisitor::arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
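+    // sat_imm encodes the saturation width minus one, giving signed widths 1..32.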
+ const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()) + 1;
+ const auto shift = !sh ? ShiftType::LSL : ShiftType::ASR;
+ const auto operand = EmitImmShift(ir.GetRegister(n), shift, imm5, ir.GetCFlag());
+ const auto result = ir.SignedSaturation(operand.result, saturate_to);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// SSAT16<c> <Rd>, #<imm>, <Rn>
+bool TranslatorVisitor::arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()) + 1;
+ const auto lo_operand = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(ir.GetRegister(n)));
+ const auto hi_operand = ir.SignExtendHalfToWord(MostSignificantHalf(ir, ir.GetRegister(n)));
+ const auto lo_result = ir.SignedSaturation(lo_operand, saturate_to);
+ const auto hi_result = ir.SignedSaturation(hi_operand, saturate_to);
+
+ ir.SetRegister(d, Pack2x16To1x32(ir, lo_result.result, hi_result.result));
+ ir.OrQFlag(lo_result.overflow);
+ ir.OrQFlag(hi_result.overflow);
+ return true;
+}
+
+// USAT<c> <Rd>, #<imm5>, <Rn>{, <shift>}
+bool TranslatorVisitor::arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend());
+ const auto shift = !sh ? ShiftType::LSL : ShiftType::ASR;
+ const auto operand = EmitImmShift(ir.GetRegister(n), shift, imm5, ir.GetCFlag());
+ const auto result = ir.UnsignedSaturation(operand.result, saturate_to);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// USAT16<c> <Rd>, #<imm4>, <Rn>
+bool TranslatorVisitor::arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ // UnsignedSaturation takes a *signed* value as input, hence sign extension is required.
+ const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend());
+ const auto lo_operand = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(ir.GetRegister(n)));
+ const auto hi_operand = ir.SignExtendHalfToWord(MostSignificantHalf(ir, ir.GetRegister(n)));
+ const auto lo_result = ir.UnsignedSaturation(lo_operand, saturate_to);
+ const auto hi_result = ir.UnsignedSaturation(hi_operand, saturate_to);
+
+ ir.SetRegister(d, Pack2x16To1x32(ir, lo_result.result, hi_result.result));
+ ir.OrQFlag(lo_result.overflow);
+ ir.OrQFlag(hi_result.overflow);
+ return true;
+}
+
+// Saturated Add/Subtract instructions
+
+// QADD<c> <Rd>, <Rm>, <Rn>
+bool TranslatorVisitor::arm_QADD(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto a = ir.GetRegister(m);
+ const auto b = ir.GetRegister(n);
+ const auto result = ir.SignedSaturatedAddWithFlag(a, b);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// QSUB<c> <Rd>, <Rm>, <Rn>
+bool TranslatorVisitor::arm_QSUB(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto a = ir.GetRegister(m);
+ const auto b = ir.GetRegister(n);
+ const auto result = ir.SignedSaturatedSubWithFlag(a, b);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// QDADD<c> <Rd>, <Rm>, <Rn>
+bool TranslatorVisitor::arm_QDADD(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto a = ir.GetRegister(m);
+ const auto b = ir.GetRegister(n);
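+    // Doubling Rn saturates independently; its overflow also sets the Q flag.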
+ const auto doubled = ir.SignedSaturatedAddWithFlag(b, b);
+ ir.OrQFlag(doubled.overflow);
+
+ const auto result = ir.SignedSaturatedAddWithFlag(a, doubled.result);
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// QDSUB<c> <Rd>, <Rm>, <Rn>
+bool TranslatorVisitor::arm_QDSUB(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto a = ir.GetRegister(m);
+ const auto b = ir.GetRegister(n);
+ const auto doubled = ir.SignedSaturatedAddWithFlag(b, b);
+ ir.OrQFlag(doubled.overflow);
+
+ const auto result = ir.SignedSaturatedSubWithFlag(a, doubled.result);
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+// Parallel saturated instructions
+
+// QASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto diff = ir.SignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result;
+ const auto sum = ir.SignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, diff, sum);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// QSAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_QSAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto sum = ir.SignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result;
+ const auto diff = ir.SignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, sum, diff);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQASX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQASX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result;
+ const auto sum = ir.UnsignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, diff, sum);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// UQSAX<c> <Rd>, <Rn>, <Rm>
+bool TranslatorVisitor::arm_UQSAX(Cond cond, Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto sum = ir.UnsignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result;
+ const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, sum, diff);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp
new file mode 100644
index 0000000000..60110df891
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp
@@ -0,0 +1,121 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// CPS<effect> <iflags>{, #<mode>}
+// CPS #<mode>
+bool TranslatorVisitor::arm_CPS() {
+ return InterpretThisInstruction();
+}
+
+// MRS<c> <Rd>, <spec_reg>
+bool TranslatorVisitor::arm_MRS(Cond cond, Reg d) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.SetRegister(d, ir.GetCpsr());
+ return true;
+}
+
+// MSR<c> <spec_reg>, #<const>
+bool TranslatorVisitor::arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8) {
+ ASSERT_MSG(mask != 0, "Decode error");
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const bool write_nzcvq = mcl::bit::get_bit<3>(mask);
+ const bool write_g = mcl::bit::get_bit<2>(mask);
+ const bool write_e = mcl::bit::get_bit<1>(mask);
+ const u32 imm32 = ArmExpandImm(rotate, imm8);
+
+ if (write_nzcvq) {
+ ir.SetCpsrNZCVQ(ir.Imm32(imm32 & 0xF8000000));
+ }
+
+ if (write_g) {
+ ir.SetGEFlagsCompressed(ir.Imm32(imm32 & 0x000F0000));
+ }
+
+ if (write_e) {
+ const bool E = (imm32 & 0x00000200) != 0;
+ if (E != ir.current_location.EFlag()) {
+ ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(4).SetEFlag(E)});
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// MSR<c> <spec_reg>, <Rn>
+bool TranslatorVisitor::arm_MSR_reg(Cond cond, unsigned mask, Reg n) {
+ if (mask == 0) {
+ return UnpredictableInstruction();
+ }
+
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const bool write_nzcvq = mcl::bit::get_bit<3>(mask);
+ const bool write_g = mcl::bit::get_bit<2>(mask);
+ const bool write_e = mcl::bit::get_bit<1>(mask);
+ const auto value = ir.GetRegister(n);
+
+ if (!write_e) {
+ if (write_nzcvq) {
+ ir.SetCpsrNZCVQ(ir.And(value, ir.Imm32(0xF8000000)));
+ }
+
+ if (write_g) {
+ ir.SetGEFlagsCompressed(ir.And(value, ir.Imm32(0x000F0000)));
+ }
+ } else {
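+        // The E bit affects data endianness, so the CPSR is written directly here
+        // and the block ends at the next instruction.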
+ const u32 cpsr_mask = (write_nzcvq ? 0xF8000000 : 0) | (write_g ? 0x000F0000 : 0) | 0x00000200;
+ const auto old_cpsr = ir.And(ir.GetCpsr(), ir.Imm32(~cpsr_mask));
+ const auto new_cpsr = ir.And(value, ir.Imm32(cpsr_mask));
+ ir.SetCpsr(ir.Or(old_cpsr, new_cpsr));
+ ir.PushRSB(ir.current_location.AdvancePC(4));
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}});
+ return false;
+ }
+
+ return true;
+}
+
+// RFE{<amode>} <Rn>{!}
+bool TranslatorVisitor::arm_RFE() {
+ return InterpretThisInstruction();
+}
+
+// SETEND <endian_specifier>
+bool TranslatorVisitor::arm_SETEND(bool E) {
+ ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(4).SetEFlag(E)});
+ return false;
+}
+
+// SRS{<amode>} SP{!}, #<mode>
+bool TranslatorVisitor::arm_SRS() {
+ return InterpretThisInstruction();
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp
new file mode 100644
index 0000000000..e9f04b70e5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp
@@ -0,0 +1,439 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// CLREX
+bool TranslatorVisitor::arm_CLREX() {
+ ir.ClearExclusive();
+ return true;
+}
+
+// SWP<c> <Rt>, <Rt2>, [<Rn>]
+// TODO: UNDEFINED if current mode is Hypervisor
+bool TranslatorVisitor::arm_SWP(Cond cond, Reg n, Reg t, Reg t2) {
+ if (t == Reg::PC || t2 == Reg::PC || n == Reg::PC || n == t || n == t2) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ // TODO (HACK): Implement bus locking here
+ const auto data = ir.ReadMemory32(ir.GetRegister(n), IR::AccType::SWAP);
+ ir.WriteMemory32(ir.GetRegister(n), ir.GetRegister(t2), IR::AccType::SWAP);
+ // TODO: Alignment check
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// SWPB<c> <Rt>, <Rt2>, [<Rn>]
+// TODO: UNDEFINED if current mode is Hypervisor
+bool TranslatorVisitor::arm_SWPB(Cond cond, Reg n, Reg t, Reg t2) {
+ if (t == Reg::PC || t2 == Reg::PC || n == Reg::PC || n == t || n == t2) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ // TODO (HACK): Implement bus locking here
+ const auto data = ir.ReadMemory8(ir.GetRegister(n), IR::AccType::SWAP);
+ ir.WriteMemory8(ir.GetRegister(n), ir.LeastSignificantByte(ir.GetRegister(t2)), IR::AccType::SWAP);
+ // TODO: Alignment check
+ ir.SetRegister(t, ir.ZeroExtendByteToWord(data));
+ return true;
+}
+
+// LDA<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDA(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ReadMemory32(address, IR::AccType::ORDERED));
+ return true;
+}
+
+// LDAB<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDAB(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendToWord(ir.ReadMemory8(address, IR::AccType::ORDERED)));
+ return true;
+}
+
+// LDAH<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDAH(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendToWord(ir.ReadMemory16(address, IR::AccType::ORDERED)));
+ return true;
+}
+
+// LDAEX<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDAEX(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ExclusiveReadMemory32(address, IR::AccType::ORDERED));
+ return true;
+}
+
+// LDAEXB<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDAEXB(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendByteToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ORDERED)));
+ return true;
+}
+
+// LDAEXD<c> <Rt>, <Rt2>, [<Rn>]
+bool TranslatorVisitor::arm_LDAEXD(Cond cond, Reg n, Reg t) {
+ if (t == Reg::LR || t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ORDERED);
+ // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR
+ ir.SetRegister(t, lo);
+ ir.SetRegister(t + 1, hi);
+ return true;
+}
+
+// LDAEXH<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDAEXH(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendHalfToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ORDERED)));
+ return true;
+}
+
+// STL<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STL(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::ORDERED);
+ return true;
+}
+
+// STLB<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STLB(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::ORDERED);
+ return true;
+}
+
+// STLH<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STLH(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::ORDERED);
+ return true;
+}
+
+// STLEXB<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STLEXB(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantByte(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ORDERED);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STLEXD<c> <Rd>, <Rt>, <Rt2>, [<Rn>]
+bool TranslatorVisitor::arm_STLEXD(Cond cond, Reg n, Reg d, Reg t) {
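+    // Rt must be an even-numbered register other than R14, since Rt2 is implicitly
+    // Rt+1 (R14 would place Rt2 at the PC).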
+ if (n == Reg::PC || d == Reg::PC || t == Reg::LR || static_cast<size_t>(t) % 2 == 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t || d == t + 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const Reg t2 = t + 1;
+ const auto address = ir.GetRegister(n);
+ const auto value_lo = ir.GetRegister(t);
+ const auto value_hi = ir.GetRegister(t2);
+ const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ORDERED);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STLEXH<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STLEXH(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantHalf(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory16(address, value, IR::AccType::ORDERED);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STLEX<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STLEX(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.GetRegister(t);
+ const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ORDERED);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// LDREX<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDREX(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ExclusiveReadMemory32(address, IR::AccType::ATOMIC));
+ return true;
+}
+
+// LDREXB<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDREXB(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendByteToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ATOMIC)));
+ return true;
+}
+
+// LDREXD<c> <Rt>, <Rt2>, [<Rn>]
+bool TranslatorVisitor::arm_LDREXD(Cond cond, Reg n, Reg t) {
+ if (t == Reg::LR || t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ATOMIC);
+ // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR
+ ir.SetRegister(t, lo);
+ ir.SetRegister(t + 1, hi);
+ return true;
+}
+
+// LDREXH<c> <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_LDREXH(Cond cond, Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ZeroExtendHalfToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ATOMIC)));
+ return true;
+}
+
+// STREX<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STREX(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.GetRegister(t);
+ const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STREXB<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STREXB(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantByte(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STREXD<c> <Rd>, <Rt>, <Rt2>, [<Rn>]
+bool TranslatorVisitor::arm_STREXD(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::LR || static_cast<size_t>(t) % 2 == 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t || d == t + 1) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const Reg t2 = t + 1;
+ const auto address = ir.GetRegister(n);
+ const auto value_lo = ir.GetRegister(t);
+ const auto value_hi = ir.GetRegister(t2);
+ const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+// STREXH<c> <Rd>, <Rt>, [<Rn>]
+bool TranslatorVisitor::arm_STREXH(Cond cond, Reg n, Reg d, Reg t) {
+ if (n == Reg::PC || d == Reg::PC || t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ if (!ArmConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantHalf(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory16(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp
new file mode 100644
index 0000000000..67a94a0e80
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp
@@ -0,0 +1,995 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+
+// LSLS <Rd>, <Rm>, #<imm5>
+bool TranslatorVisitor::thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d) {
+ const u8 shift_n = imm5.ZeroExtend<u8>();
+ if (shift_n == 0 && ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.LogicalShiftLeft(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c);
+
+ ir.SetRegister(d, result.result);
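+    // 16-bit Thumb data-processing instructions set flags only when outside an IT block.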
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// LSRS <Rd>, <Rm>, #<imm5>
+bool TranslatorVisitor::thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d) {
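+    // An imm5 of zero encodes a shift amount of 32 for LSR (the same applies to ASR below).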
+ const u8 shift_n = imm5 != 0 ? imm5.ZeroExtend<u8>() : u8(32);
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.LogicalShiftRight(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c);
+
+ ir.SetRegister(d, result.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// ASRS <Rd>, <Rm>, #<imm5>
+bool TranslatorVisitor::thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d) {
+ const u8 shift_n = imm5 != 0 ? imm5.ZeroExtend<u8>() : u8(32);
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.ArithmeticShiftRight(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c);
+
+ ir.SetRegister(d, result.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// ADDS <Rd>, <Rn>, <Rm>
+// Note that it is not possible to encode Rd == R15.
+bool TranslatorVisitor::thumb16_ADD_reg_t1(Reg m, Reg n, Reg d) {
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// SUBS <Rd>, <Rn>, <Rm>
+// Note that it is not possible to encode Rd == R15.
+bool TranslatorVisitor::thumb16_SUB_reg(Reg m, Reg n, Reg d) {
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// ADDS <Rd>, <Rn>, #<imm3>
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d) {
+ const u32 imm32 = imm3.ZeroExtend();
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// SUBS <Rd>, <Rn>, #<imm3>
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d) {
+ const u32 imm32 = imm3.ZeroExtend();
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// MOVS <Rd>, #<imm8>
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_MOV_imm(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend();
+ const auto result = ir.Imm32(imm32);
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// CMP <Rn>, #<imm8>
+bool TranslatorVisitor::thumb16_CMP_imm(Reg n, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend();
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// ADDS <Rdn>, #<imm8>
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend();
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// SUBS <Rdn>, #<imm8>
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend();
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// ANDS <Rdn>, <Rm>
+// Note that it is not possible to encode Rdn == R15.
+bool TranslatorVisitor::thumb16_AND_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.And(ir.GetRegister(n), ir.GetRegister(m));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// EORS <Rdn>, <Rm>
+// Note that it is not possible to encode Rdn == R15.
+bool TranslatorVisitor::thumb16_EOR_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.Eor(ir.GetRegister(n), ir.GetRegister(m));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// LSLS <Rdn>, <Rm>
+bool TranslatorVisitor::thumb16_LSL_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m));
+ const auto apsr_c = ir.GetCFlag();
+ const auto result_carry = ir.LogicalShiftLeft(ir.GetRegister(n), shift_n, apsr_c);
+
+ ir.SetRegister(d, result_carry.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result_carry.result), result_carry.carry);
+ }
+ return true;
+}
+
+// LSRS <Rdn>, <Rm>
+bool TranslatorVisitor::thumb16_LSR_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m));
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.LogicalShiftRight(ir.GetRegister(n), shift_n, cpsr_c);
+
+ ir.SetRegister(d, result.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// ASRS <Rdn>, <Rm>
+bool TranslatorVisitor::thumb16_ASR_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m));
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.ArithmeticShiftRight(ir.GetRegister(n), shift_n, cpsr_c);
+
+ ir.SetRegister(d, result.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// ADCS <Rdn>, <Rm>
+// Note that it is not possible to encode Rd == R15.
+bool TranslatorVisitor::thumb16_ADC_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+    const auto apsr_c = ir.GetCFlag();
+    const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), apsr_c);
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// SBCS <Rdn>, <Rm>
+// Note that it is not possible to encode Rd == R15.
+bool TranslatorVisitor::thumb16_SBC_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+    const auto apsr_c = ir.GetCFlag();
+    const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), apsr_c);
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// RORS <Rdn>, <Rm>
+bool TranslatorVisitor::thumb16_ROR_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m));
+ const auto cpsr_c = ir.GetCFlag();
+ const auto result = ir.RotateRight(ir.GetRegister(n), shift_n, cpsr_c);
+
+ ir.SetRegister(d, result.result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry);
+ }
+ return true;
+}
+
+// TST <Rn>, <Rm>
+bool TranslatorVisitor::thumb16_TST_reg(Reg m, Reg n) {
+ const auto result = ir.And(ir.GetRegister(n), ir.GetRegister(m));
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ return true;
+}
+
+// RSBS <Rd>, <Rn>, #0
+// Rd can never encode R15.
+bool TranslatorVisitor::thumb16_RSB_imm(Reg n, Reg d) {
+ const auto result = ir.SubWithCarry(ir.Imm32(0), ir.GetRegister(n), ir.Imm1(1));
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+// CMP <Rn>, <Rm>
+bool TranslatorVisitor::thumb16_CMP_reg_t1(Reg m, Reg n) {
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1));
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// CMN <Rn>, <Rm>
+bool TranslatorVisitor::thumb16_CMN_reg(Reg m, Reg n) {
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0));
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// ORRS <Rdn>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_ORR_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.Or(ir.GetRegister(m), ir.GetRegister(n));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// MULS <Rdm>, <Rn>, <Rdm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_MUL_reg(Reg n, Reg d_m) {
+ const Reg d = d_m;
+ const Reg m = d_m;
+ const auto result = ir.Mul(ir.GetRegister(m), ir.GetRegister(n));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// BICS <Rdn>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_BIC_reg(Reg m, Reg d_n) {
+ const Reg d = d_n;
+ const Reg n = d_n;
+ const auto result = ir.AndNot(ir.GetRegister(n), ir.GetRegister(m));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// MVNS <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_MVN_reg(Reg m, Reg d) {
+ const auto result = ir.Not(ir.GetRegister(m));
+
+ ir.SetRegister(d, result);
+ if (!ir.current_location.IT().IsInITBlock()) {
+ ir.SetCpsrNZ(ir.NZFrom(result));
+ }
+ return true;
+}
+
+// ADD <Rdn>, <Rm>
+bool TranslatorVisitor::thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo) {
+ const Reg d_n = d_n_hi ? (d_n_lo + 8) : d_n_lo;
+ const Reg n = d_n;
+ const Reg d = d_n;
+ if (n == Reg::PC && m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (d == Reg::PC && ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0));
+ if (d == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.ALUWritePC(result);
+ // Return to dispatch as we can't predict what PC is going to be. Stop compilation.
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ } else {
+ ir.SetRegister(d, result);
+ return true;
+ }
+}
+
+// CMP <Rn>, <Rm>
+bool TranslatorVisitor::thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo) {
+ const Reg n = n_hi ? (n_lo + 8) : n_lo;
+ if (n < Reg::R8 && m < Reg::R8) {
+ return UnpredictableInstruction();
+ }
+ if (n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1));
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+// MOV <Rd>, <Rm>
+bool TranslatorVisitor::thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo) {
+ const Reg d = d_hi ? (d_lo + 8) : d_lo;
+ if (d == Reg::PC && ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const auto result = ir.GetRegister(m);
+
+ if (d == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.ALUWritePC(result);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ } else {
+ ir.SetRegister(d, result);
+ return true;
+ }
+}
+
+// LDR <Rt>, <label>
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDR_literal(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const u32 address = ir.AlignPC(4) + imm32;
+ const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL);
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STR <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STR_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.GetRegister(t);
+
+ ir.WriteMemory32(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// STRH <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STRH_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.LeastSignificantHalf(ir.GetRegister(t));
+
+ ir.WriteMemory16(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// STRB <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STRB_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.LeastSignificantByte(ir.GetRegister(t));
+
+ ir.WriteMemory8(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// LDRSB <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDRSB_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDR <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDR_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRH <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDRH_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRB <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDRB_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// LDRSH <Rt>, [<Rn>, <Rm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDRSH_reg(Reg m, Reg n, Reg t) {
+ const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m));
+ const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STR <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.GetRegister(t);
+
+ ir.WriteMemory32(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// LDR <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STRB <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.LeastSignificantByte(ir.GetRegister(t));
+
+ ir.WriteMemory8(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// LDRB <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STRH <Rt>, [<Rn>, #<imm5>]
+bool TranslatorVisitor::thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.LeastSignificantHalf(ir.GetRegister(t));
+
+ ir.WriteMemory16(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// LDRH <Rt>, [<Rn>, #<imm5>]
+bool TranslatorVisitor::thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// STR <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_STR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const Reg n = Reg::SP;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.GetRegister(t);
+
+ ir.WriteMemory32(address, data, IR::AccType::NORMAL);
+ return true;
+}
+
+// LDR <Rt>, [<Rn>, #<imm>]
+// Rt cannot encode R15.
+bool TranslatorVisitor::thumb16_LDR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const Reg n = Reg::SP;
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32));
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+// ADR <Rd>, <label>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_ADR(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const auto result = ir.Imm32(ir.AlignPC(4) + imm32);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// ADD <Rd>, SP, #<imm>
+bool TranslatorVisitor::thumb16_ADD_sp_t1(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const auto result = ir.AddWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// ADD SP, SP, #<imm>
+bool TranslatorVisitor::thumb16_ADD_sp_t2(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ const Reg d = Reg::SP;
+ const auto result = ir.AddWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SUB SP, SP, #<imm>
+bool TranslatorVisitor::thumb16_SUB_sp(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ const Reg d = Reg::SP;
+ const auto result = ir.SubWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+// SEV<c>
+bool TranslatorVisitor::thumb16_SEV() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::SendEvent);
+}
+
+// SEVL<c>
+bool TranslatorVisitor::thumb16_SEVL() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::SendEventLocal);
+}
+
+// WFE<c>
+bool TranslatorVisitor::thumb16_WFE() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::WaitForEvent);
+}
+
+// WFI<c>
+bool TranslatorVisitor::thumb16_WFI() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::WaitForInterrupt);
+}
+
+// YIELD<c>
+bool TranslatorVisitor::thumb16_YIELD() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::Yield);
+}
+
+// NOP<c>
+bool TranslatorVisitor::thumb16_NOP() {
+ return true;
+}
+
+// IT{<x>{<y>{<z>}}} <cond>
+bool TranslatorVisitor::thumb16_IT(Imm<8> imm8) {
+ ASSERT_MSG((imm8.Bits<0, 3>() != 0b0000), "Decode Error");
+ if (imm8.Bits<4, 7>() == 0b1111 || (imm8.Bits<4, 7>() == 0b1110 && mcl::bit::count_ones(imm8.Bits<0, 3>()) != 1)) {
+ return UnpredictableInstruction();
+ }
+ if (ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const auto next_location = ir.current_location.AdvancePC(2).SetIT(ITState{imm8.ZeroExtend<u8>()});
+ ir.SetTerm(IR::Term::LinkBlockFast{next_location});
+ return false;
+}
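The SetIT/AdvanceIT bookkeeping used here and in the surrounding handlers follows the ARM ARM's ITAdvance() rule: the low five bits of ITSTATE shift left after each instruction in the block, and the block ends once the low three bits reach zero. A hedged sketch of that rule, illustrative only; the project's ITState type is assumed to behave equivalently:

    #include <cstdint>
    static uint8_t ITAdvance(uint8_t itstate) {
        if ((itstate & 0b111) == 0) {
            return 0;                                                // block finished
        }
        return uint8_t((itstate & 0xE0) | ((itstate << 1) & 0x1F));  // keep bits 7:5, shift the mask
    }
    // e.g. a single-instruction "IT EQ" sets ITSTATE = 0b0000'1000; one advance returns 0.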
+
+// SXTH <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_SXTH(Reg m, Reg d) {
+ const auto half = ir.LeastSignificantHalf(ir.GetRegister(m));
+ ir.SetRegister(d, ir.SignExtendHalfToWord(half));
+ return true;
+}
+
+// SXTB <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_SXTB(Reg m, Reg d) {
+ const auto byte = ir.LeastSignificantByte(ir.GetRegister(m));
+ ir.SetRegister(d, ir.SignExtendByteToWord(byte));
+ return true;
+}
+
+// UXTH <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_UXTH(Reg m, Reg d) {
+ const auto half = ir.LeastSignificantHalf(ir.GetRegister(m));
+ ir.SetRegister(d, ir.ZeroExtendHalfToWord(half));
+ return true;
+}
+
+// UXTB <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_UXTB(Reg m, Reg d) {
+ const auto byte = ir.LeastSignificantByte(ir.GetRegister(m));
+ ir.SetRegister(d, ir.ZeroExtendByteToWord(byte));
+ return true;
+}
+
+// PUSH <reg_list>
+// reg_list cannot encode R15.
+bool TranslatorVisitor::thumb16_PUSH(bool M, RegList reg_list) {
+ if (M) {
+ reg_list |= 1 << 14;
+ }
+ if (mcl::bit::count_ones(reg_list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 num_bytes_to_push = static_cast<u32>(4 * mcl::bit::count_ones(reg_list));
+ const auto final_address = ir.Sub(ir.GetRegister(Reg::SP), ir.Imm32(num_bytes_to_push));
+ auto address = final_address;
+ for (size_t i = 0; i < 16; i++) {
+ if (mcl::bit::get_bit(i, reg_list)) {
+ // TODO: Deal with alignment
+ const auto Ri = ir.GetRegister(static_cast<Reg>(i));
+ ir.WriteMemory32(address, Ri, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ ir.SetRegister(Reg::SP, final_address);
+ // TODO(optimization): Possible location for an RSB push.
+ return true;
+}
+
+// POP <reg_list>
+bool TranslatorVisitor::thumb16_POP(bool P, RegList reg_list) {
+ if (P) {
+ reg_list |= 1 << 15;
+ }
+ if (mcl::bit::count_ones(reg_list) < 1) {
+ return UnpredictableInstruction();
+ }
+
+ auto address = ir.GetRegister(Reg::SP);
+ for (size_t i = 0; i < 15; i++) {
+ if (mcl::bit::get_bit(i, reg_list)) {
+ // TODO: Deal with alignment
+ const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ ir.SetRegister(static_cast<Reg>(i), data);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ if (mcl::bit::get_bit<15>(reg_list)) {
+ // TODO(optimization): Possible location for an RSB pop.
+ const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(data);
+ address = ir.Add(address, ir.Imm32(4));
+ ir.SetRegister(Reg::SP, address);
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ return false;
+ } else {
+ ir.SetRegister(Reg::SP, address);
+ return true;
+ }
+}
+
+// SETEND <endianness>
+bool TranslatorVisitor::thumb16_SETEND(bool E) {
+ if (ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ if (E == ir.current_location.EFlag()) {
+ return true;
+ }
+
+ ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(2).SetEFlag(E).AdvanceIT()});
+ return false;
+}
+
+// CPS{IE,ID} <a,i,f>
+// A CPS is treated as a NOP in User mode.
+bool TranslatorVisitor::thumb16_CPS(bool, bool, bool, bool) {
+ return true;
+}
+
+// REV <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_REV(Reg m, Reg d) {
+ ir.SetRegister(d, ir.ByteReverseWord(ir.GetRegister(m)));
+ return true;
+}
+
+// REV16 <Rd>, <Rm>
+// Rd cannot encode R15.
+// TODO: Consider optimizing
+bool TranslatorVisitor::thumb16_REV16(Reg m, Reg d) {
+ const auto Rm = ir.GetRegister(m);
+ const auto upper_half = ir.LeastSignificantHalf(ir.LogicalShiftRight(Rm, ir.Imm8(16), ir.Imm1(0)).result);
+ const auto lower_half = ir.LeastSignificantHalf(Rm);
+ const auto rev_upper_half = ir.ZeroExtendHalfToWord(ir.ByteReverseHalf(upper_half));
+ const auto rev_lower_half = ir.ZeroExtendHalfToWord(ir.ByteReverseHalf(lower_half));
+ const auto result = ir.Or(ir.LogicalShiftLeft(rev_upper_half, ir.Imm8(16), ir.Imm1(0)).result,
+ rev_lower_half);
+
+ ir.SetRegister(d, result);
+ return true;
+}
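The shift/byte-reverse/OR sequence above computes a per-halfword byte swap; the TODO notes it could be emitted more directly. Equivalent scalar logic, as a point of reference only:

    #include <cstdint>
    static uint32_t Rev16(uint32_t x) {
        return ((x & 0x00FF00FFu) << 8) | ((x & 0xFF00FF00u) >> 8);  // swap bytes within each halfword
    }
    // Rev16(0xAABBCCDD) == 0xBBAADDCC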
+
+// REVSH <Rd>, <Rm>
+// Rd cannot encode R15.
+bool TranslatorVisitor::thumb16_REVSH(Reg m, Reg d) {
+ const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(ir.GetRegister(m)));
+ ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half));
+ return true;
+}
+
+// BKPT #<imm8>
+bool TranslatorVisitor::thumb16_BKPT(Imm<8> /*imm8*/) {
+ return RaiseException(Exception::Breakpoint);
+}
+
+// STM <Rn>!, <reg_list>
+bool TranslatorVisitor::thumb16_STMIA(Reg n, RegList reg_list) {
+ if (mcl::bit::count_ones(reg_list) == 0) {
+ return UnpredictableInstruction();
+ }
+ if (mcl::bit::get_bit(static_cast<size_t>(n), reg_list) && n != static_cast<Reg>(mcl::bit::lowest_set_bit(reg_list))) {
+ return UnpredictableInstruction();
+ }
+
+ auto address = ir.GetRegister(n);
+ for (size_t i = 0; i < 8; i++) {
+ if (mcl::bit::get_bit(i, reg_list)) {
+ const auto Ri = ir.GetRegister(static_cast<Reg>(i));
+ ir.WriteMemory32(address, Ri, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ ir.SetRegister(n, address);
+ return true;
+}
+
+// LDM <Rn>!, <reg_list>
+bool TranslatorVisitor::thumb16_LDMIA(Reg n, RegList reg_list) {
+ if (mcl::bit::count_ones(reg_list) == 0) {
+ return UnpredictableInstruction();
+ }
+
+ const bool write_back = !mcl::bit::get_bit(static_cast<size_t>(n), reg_list);
+ auto address = ir.GetRegister(n);
+
+ for (size_t i = 0; i < 8; i++) {
+ if (mcl::bit::get_bit(i, reg_list)) {
+ const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ ir.SetRegister(static_cast<Reg>(i), data);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ if (write_back) {
+ ir.SetRegister(n, address);
+ }
+ return true;
+}
+
+// CB{N}Z <Rn>, <label>
+bool TranslatorVisitor::thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n) {
+ if (ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 imm = concatenate(i, imm5, Imm<1>{0}).ZeroExtend();
+ const IR::U32 rn = ir.GetRegister(n);
+
+ ir.SetCheckBit(ir.IsZero(rn));
+
+ const auto [cond_pass, cond_fail] = [this, imm, nonzero] {
+ const auto skip = IR::Term::LinkBlock{ir.current_location.AdvancePC(2).AdvanceIT()};
+ const auto branch = IR::Term::LinkBlock{ir.current_location.AdvancePC(imm + 4).AdvanceIT()};
+
+ if (nonzero) {
+ return std::make_pair(skip, branch);
+ } else {
+ return std::make_pair(branch, skip);
+ }
+ }();
+
+ ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail});
+ return false;
+}
+
+bool TranslatorVisitor::thumb16_UDF() {
+ return UndefinedInstruction();
+}
+
+// BX <Rm>
+bool TranslatorVisitor::thumb16_BX(Reg m) {
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ ir.UpdateUpperLocationDescriptor();
+ ir.BXWritePC(ir.GetRegister(m));
+ if (m == Reg::R14)
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ else
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+// BLX <Rm>
+bool TranslatorVisitor::thumb16_BLX_reg(Reg m) {
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ ir.PushRSB(ir.current_location.AdvancePC(2).AdvanceIT());
+ ir.UpdateUpperLocationDescriptor();
+ ir.BXWritePC(ir.GetRegister(m));
+ ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 2) | 1));
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+// SVC #<imm8>
+bool TranslatorVisitor::thumb16_SVC(Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend();
+ ir.PushRSB(ir.current_location.AdvancePC(2).AdvanceIT());
+ ir.UpdateUpperLocationDescriptor();
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 2));
+ ir.CallSupervisor(ir.Imm32(imm32));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}});
+ return false;
+}
+
+// B<cond> <label>
+bool TranslatorVisitor::thumb16_B_t1(Cond cond, Imm<8> imm8) {
+ if (ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ if (cond == Cond::AL) {
+ return thumb16_UDF();
+ }
+
+ const s32 imm32 = static_cast<s32>((imm8.SignExtend<u32>() << 1) + 4);
+ const auto then_location = ir.current_location.AdvancePC(imm32).AdvanceIT();
+ const auto else_location = ir.current_location.AdvancePC(2).AdvanceIT();
+
+ ir.SetTerm(IR::Term::If{cond, IR::Term::LinkBlock{then_location}, IR::Term::LinkBlock{else_location}});
+ return false;
+}
+
+// B <label>
+bool TranslatorVisitor::thumb16_B_t2(Imm<11> imm11) {
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const s32 imm32 = static_cast<s32>((imm11.SignExtend<u32>() << 1) + 4);
+ const auto next_location = ir.current_location.AdvancePC(imm32).AdvanceIT();
+
+ ir.SetTerm(IR::Term::LinkBlock{next_location});
+ return false;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp
new file mode 100644
index 0000000000..83734f5c70
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp
@@ -0,0 +1,88 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+// BL <label>
+bool TranslatorVisitor::thumb32_BL_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) {
+ const Imm<1> i1{j1 == S};
+ const Imm<1> i2{j2 == S};
+
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT());
+ ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 4) | 1));
+
+ const s32 imm32 = static_cast<s32>((concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1) + 4);
+ const auto new_location = ir.current_location
+ .AdvancePC(imm32)
+ .AdvanceIT();
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+// BLX <label>
+bool TranslatorVisitor::thumb32_BLX_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) {
+ const Imm<1> i1{j1 == S};
+ const Imm<1> i2{j2 == S};
+
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ if (lo.Bit<0>()) {
+ return UnpredictableInstruction();
+ }
+
+ ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT());
+ ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 4) | 1));
+
+ const s32 imm32 = static_cast<s32>(concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1);
+ const auto new_location = ir.current_location
+ .SetPC(ir.AlignPC(4) + imm32)
+ .SetTFlag(false)
+ .AdvanceIT();
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+bool TranslatorVisitor::thumb32_B(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) {
+ const Imm<1> i1{j1 == S};
+ const Imm<1> i2{j2 == S};
+
+ if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+ const s32 imm32 = static_cast<s32>((concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1) + 4);
+ const auto new_location = ir.current_location
+ .AdvancePC(imm32)
+ .AdvanceIT();
+ ir.SetTerm(IR::Term::LinkBlock{new_location});
+ return false;
+}
+
+bool TranslatorVisitor::thumb32_B_cond(Imm<1> S, Cond cond, Imm<6> hi, Imm<1> i1, Imm<1> i2, Imm<11> lo) {
+ if (ir.current_location.IT().IsInITBlock()) {
+ return UnpredictableInstruction();
+ }
+
+    // Note: unlike the other 32-bit branch encodings, i1 and i2 come straight from the encoding (not XORed with S) and are concatenated in swapped order.
+ const s32 imm32 = static_cast<s32>((concatenate(S, i2, i1, hi, lo).SignExtend<u32>() << 1) + 4);
+ const auto then_location = ir.current_location
+ .AdvancePC(imm32)
+ .AdvanceIT();
+ const auto else_location = ir.current_location
+ .AdvancePC(4)
+ .AdvanceIT();
+ ir.SetTerm(IR::Term::If{cond, IR::Term::LinkBlock{then_location}, IR::Term::LinkBlock{else_location}});
+ return false;
+}
+
+} // namespace Dynarmic::A32
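A note on the `Imm<1> i1{j1 == S}` idiom in this file: the ARM ARM defines I1 = NOT(J1 EOR S) and I2 = NOT(J2 EOR S) for the 32-bit branch encodings, and `j1 == S` is exactly that negated XOR. A standalone decode sketch (illustrative only) of the 25-bit offset S:I1:I2:imm10:imm11:'0', before the +4 pipeline offset the translator adds via AdvancePC:

    #include <cstdint>
    static int32_t BranchImm32(uint32_t S, uint32_t imm10, uint32_t J1, uint32_t J2, uint32_t imm11) {
        const uint32_t I1 = ~(J1 ^ S) & 1;
        const uint32_t I2 = ~(J2 ^ S) & 1;
        uint32_t imm = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
        if (S) {
            imm |= 0xFE000000u;  // sign-extend from bit 24
        }
        return int32_t(imm);
    }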
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp
new file mode 100644
index 0000000000..aa03937301
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp
@@ -0,0 +1,126 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_BXJ(Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ return thumb16_BX(m);
+}
+
+bool TranslatorVisitor::thumb32_CLREX() {
+ ir.ClearExclusive();
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_DMB(Imm<4> /*option*/) {
+ ir.DataMemoryBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_DSB(Imm<4> /*option*/) {
+ ir.DataSynchronizationBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ISB(Imm<4> /*option*/) {
+ ir.InstructionSynchronizationBarrier();
+ ir.UpdateUpperLocationDescriptor();
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+}
+
+bool TranslatorVisitor::thumb32_NOP() {
+ return thumb16_NOP();
+}
+
+bool TranslatorVisitor::thumb32_SEV() {
+ return thumb16_SEV();
+}
+
+bool TranslatorVisitor::thumb32_SEVL() {
+ return thumb16_SEVL();
+}
+
+bool TranslatorVisitor::thumb32_UDF() {
+ return thumb16_UDF();
+}
+
+bool TranslatorVisitor::thumb32_WFE() {
+ return thumb16_WFE();
+}
+
+bool TranslatorVisitor::thumb32_WFI() {
+ return thumb16_WFI();
+}
+
+bool TranslatorVisitor::thumb32_YIELD() {
+ return thumb16_YIELD();
+}
+
+bool TranslatorVisitor::thumb32_MSR_reg(bool write_spsr, Reg n, Imm<4> mask) {
+ if (mask == 0) {
+ return UnpredictableInstruction();
+ }
+
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (write_spsr) {
+ return UndefinedInstruction();
+ }
+
+ const bool write_nzcvq = mask.Bit<3>();
+ const bool write_g = mask.Bit<2>();
+ const bool write_e = mask.Bit<1>();
+ const auto value = ir.GetRegister(n);
+
+ if (!write_e) {
+ if (write_nzcvq) {
+ ir.SetCpsrNZCVQ(ir.And(value, ir.Imm32(0xF8000000)));
+ }
+
+ if (write_g) {
+ ir.SetGEFlagsCompressed(ir.And(value, ir.Imm32(0x000F0000)));
+ }
+ } else {
+ ir.UpdateUpperLocationDescriptor();
+
+ const u32 cpsr_mask = (write_nzcvq ? 0xF8000000 : 0) | (write_g ? 0x000F0000 : 0) | 0x00000200;
+ const auto old_cpsr = ir.And(ir.GetCpsr(), ir.Imm32(~cpsr_mask));
+ const auto new_cpsr = ir.And(value, ir.Imm32(cpsr_mask));
+ ir.SetCpsr(ir.Or(old_cpsr, new_cpsr));
+ ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT());
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}});
+ return false;
+ }
+
+ return true;
+}
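For reference, the masks used above correspond to these CPSR fields (values from the ARM ARM; the constant names below are illustrative, not project identifiers):

    #include <cstdint>
    constexpr uint32_t CPSR_NZCVQ = 0xF8000000;  // N, Z, C, V, Q (bits 31:27) - selected by mask<3>
    constexpr uint32_t CPSR_GE    = 0x000F0000;  // GE[3:0] (bits 19:16)       - selected by mask<2>
    constexpr uint32_t CPSR_E     = 0x00000200;  // E, endianness (bit 9)      - why the write_e path must end the block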
+
+bool TranslatorVisitor::thumb32_MRS_reg(bool read_spsr, Reg d) {
+ if (d == Reg::R15) {
+ return UnpredictableInstruction();
+ }
+
+ // TODO: Revisit when implementing more than user mode.
+
+ if (read_spsr) {
+ return UndefinedInstruction();
+ }
+
+ ir.SetRegister(d, ir.GetCpsr());
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp
new file mode 100644
index 0000000000..ab7c539932
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp
@@ -0,0 +1,75 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_MCRR(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ ir.CoprocSendTwoWords(coproc_no, two, opc, CRm, ir.GetRegister(t), ir.GetRegister(t2));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MRRC(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) {
+ const auto two_words = ir.CoprocGetTwoWords(coproc_no, two, opc, CRm);
+ ir.SetRegister(t, ir.LeastSignificantWord(two_words));
+ ir.SetRegister(t2, ir.MostSignificantWord(two_words).result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const bool index = p;
+ const bool add = u;
+ const bool wback = w;
+ const bool has_option = !p && !w && u;
+ const IR::U32 reg_n = ir.GetRegister(n);
+ const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32));
+ const IR::U32 address = index ? offset_address : reg_n;
+ ir.CoprocStoreWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>());
+ if (wback) {
+ ir.SetRegister(n, offset_address);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const bool index = p;
+ const bool add = u;
+ const bool wback = w;
+ const bool has_option = !p && !w && u;
+ const IR::U32 reg_n = ir.GetRegister(n);
+ const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32));
+ const IR::U32 address = index ? offset_address : reg_n;
+ ir.CoprocLoadWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>());
+ if (wback) {
+ ir.SetRegister(n, offset_address);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_CDP(bool two, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ ir.CoprocInternalOperation(coproc_no, two, opc1, CRd, CRn, CRm, opc2);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MCR(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ ir.CoprocSendOneWord(coproc_no, two, opc1, CRn, CRm, opc2, ir.GetRegister(t));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MRC(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) {
+ const auto word = ir.CoprocGetOneWord(coproc_no, two, opc1, CRn, CRm, opc2);
+ if (t != Reg::PC) {
+ ir.SetRegister(t, word);
+ } else {
+ const auto new_cpsr_nzcv = ir.And(word, ir.Imm32(0xF0000000));
+ ir.SetCpsrNZCVRaw(new_cpsr_nzcv);
+ }
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp
new file mode 100644
index 0000000000..0c0114f84c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp
@@ -0,0 +1,244 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_TST_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_AND_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+ if ((d == Reg::PC && !S) || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_BIC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MOV_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Imm32(imm_carry.imm32);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ORR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(n != Reg::PC, "Decode error");
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MVN_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Imm32(~imm_carry.imm32);
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ORN_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(n != Reg::PC, "Decode error");
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(~imm_carry.imm32));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_TEQ_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_EOR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+ if ((d == Reg::PC && !S) || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_CMN_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADD_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+ if ((d == Reg::PC && !S) || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag());
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SBC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag());
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_CMP_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SUB_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+ if ((d == Reg::PC && !S) || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_RSB_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = ThumbExpandImm(i, imm3, imm8);
+ const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+} // namespace Dynarmic::A32
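All of the handlers in this file lean on ThumbExpandImm/ThumbExpandImm_C, the T32 "modified immediate" expansion. A hedged sketch of the expansion rule from the ARM ARM, illustrative only; the project's helper is assumed to match, with the _C variant additionally returning bit 31 of the result as the carry-out in the rotated case:

    #include <cstdint>
    static uint32_t ThumbExpandImm(uint32_t imm12) {  // imm12 = i:imm3:imm8
        const uint32_t imm8 = imm12 & 0xFF;
        if ((imm12 >> 10) == 0) {
            switch ((imm12 >> 8) & 3) {
            case 0: return imm8;                                               // 000000xy
            case 1: return (imm8 << 16) | imm8;                                // 00xy00xy
            case 2: return (imm8 << 24) | (imm8 << 8);                         // xy00xy00
            default: return (imm8 << 24) | (imm8 << 16) | (imm8 << 8) | imm8;  // xyxyxyxy
            }
        }
        const uint32_t unrotated = 0x80 | (imm12 & 0x7F);
        const uint32_t rot = (imm12 >> 7) & 0x1F;                              // always >= 8 on this path
        return (unrotated >> rot) | (unrotated << (32 - rot));                 // ROR(1:imm12<6:0>, imm12<11:7>)
    }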
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp
new file mode 100644
index 0000000000..1a6767dbee
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp
@@ -0,0 +1,232 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/assert.hpp>
+#include <mcl/bitsizeof.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) {
+ return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
+static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) {
+ return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
+using SaturationFunction = IR::ResultAndOverflow<IR::U32> (IREmitter::*)(const IR::U32&, size_t);
+
+static bool Saturation(TranslatorVisitor& v, bool sh, Reg n, Reg d, Imm<5> shift_amount, size_t saturate_to, SaturationFunction sat_fn) {
+ ASSERT_MSG(!(sh && shift_amount == 0), "Invalid decode");
+
+ if (d == Reg::PC || n == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto shift = sh ? ShiftType::ASR : ShiftType::LSL;
+ const auto operand = v.EmitImmShift(v.ir.GetRegister(n), shift, shift_amount, v.ir.GetCFlag());
+ const auto result = (v.ir.*sat_fn)(operand.result, saturate_to);
+
+ v.ir.SetRegister(d, result.result);
+ v.ir.OrQFlag(result.overflow);
+ return true;
+}
+
+static bool Saturation16(TranslatorVisitor& v, Reg n, Reg d, size_t saturate_to, SaturationFunction sat_fn) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto reg_n = v.ir.GetRegister(n);
+
+ const auto lo_operand = v.ir.SignExtendHalfToWord(v.ir.LeastSignificantHalf(reg_n));
+ const auto hi_operand = v.ir.SignExtendHalfToWord(MostSignificantHalf(v.ir, reg_n));
+ const auto lo_result = (v.ir.*sat_fn)(lo_operand, saturate_to);
+ const auto hi_result = (v.ir.*sat_fn)(hi_operand, saturate_to);
+
+ v.ir.SetRegister(d, Pack2x16To1x32(v.ir, lo_result.result, hi_result.result));
+ v.ir.OrQFlag(lo_result.overflow);
+ v.ir.OrQFlag(hi_result.overflow);
+ return true;
+}
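The SignedSaturation/UnsignedSaturation IR operations used by these helpers clamp to an n-bit range and report whether clamping occurred, which feeds the Q flag via OrQFlag. A scalar sketch of the signed case, illustrative only:

    #include <cstdint>
    struct SatResult { int32_t value; bool overflow; };
    static SatResult SignedSaturate(int32_t x, unsigned n) {  // clamp to [-2^(n-1), 2^(n-1) - 1]
        const int64_t max = (int64_t(1) << (n - 1)) - 1;
        const int64_t min = -(int64_t(1) << (n - 1));
        if (x > max) return {int32_t(max), true};
        if (x < min) return {int32_t(min), true};
        return {x, false};
    }
    // e.g. SSAT to 8 bits: SignedSaturate(300, 8) == {127, true}, which sets Q.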
+
+bool TranslatorVisitor::thumb32_ADR_t2(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = concatenate(imm1, imm3, imm8).ZeroExtend();
+ const auto result = ir.AlignPC(4) - imm32;
+
+ ir.SetRegister(d, ir.Imm32(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADR_t3(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = concatenate(imm1, imm3, imm8).ZeroExtend();
+ const auto result = ir.AlignPC(4) + imm32;
+
+ ir.SetRegister(d, ir.Imm32(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADD_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 imm = concatenate(imm1, imm3, imm8).ZeroExtend();
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.AddWithCarry(reg_n, ir.Imm32(imm), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_BFC(Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsbit = concatenate(imm3, imm2).ZeroExtend();
+ const u32 msbit = msb.ZeroExtend();
+
+ if (msbit < lsbit) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 mask = ~(mcl::bit::ones<u32>(msbit - lsbit + 1) << lsbit);
+ const auto reg_d = ir.GetRegister(d);
+ const auto result = ir.And(reg_d, ir.Imm32(mask));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_BFI(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsbit = concatenate(imm3, imm2).ZeroExtend();
+ const u32 msbit = msb.ZeroExtend();
+
+ if (msbit < lsbit) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 inclusion_mask = mcl::bit::ones<u32>(msbit - lsbit + 1) << lsbit;
+ const u32 exclusion_mask = ~inclusion_mask;
+ const IR::U32 operand1 = ir.And(ir.GetRegister(d), ir.Imm32(exclusion_mask));
+ const IR::U32 operand2 = ir.And(ir.LogicalShiftLeft(ir.GetRegister(n), ir.Imm8(u8(lsbit))), ir.Imm32(inclusion_mask));
+ const IR::U32 result = ir.Or(operand1, operand2);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MOVT(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 imm16 = ir.Imm32(concatenate(imm4, imm1, imm3, imm8).ZeroExtend() << 16);
+ const IR::U32 operand = ir.GetRegister(d);
+ const IR::U32 result = ir.Or(ir.And(operand, ir.Imm32(0x0000FFFFU)), imm16);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MOVW_imm(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 imm = ir.Imm32(concatenate(imm4, imm1, imm3, imm8).ZeroExtend());
+
+ ir.SetRegister(d, imm);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsbit = concatenate(imm3, imm2).ZeroExtend();
+ const u32 widthm1_value = widthm1.ZeroExtend();
+ const u32 msb = lsbit + widthm1_value;
+ if (msb >= mcl::bitsizeof<u32>) {
+ return UnpredictableInstruction();
+ }
+
+ constexpr size_t max_width = mcl::bitsizeof<u32>;
+ const auto width = widthm1_value + 1;
+ const auto left_shift_amount = static_cast<u8>(max_width - width - lsbit);
+ const auto right_shift_amount = static_cast<u8>(max_width - width);
+ const auto operand = ir.GetRegister(n);
+ const auto tmp = ir.LogicalShiftLeft(operand, ir.Imm8(left_shift_amount));
+ const auto result = ir.ArithmeticShiftRight(tmp, ir.Imm8(right_shift_amount));
+
+ ir.SetRegister(d, result);
+ return true;
+}
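The left-then-right shift pair above is the usual sign-extending bitfield extract. A scalar equivalent, assuming arithmetic right shift on signed values (illustrative only):

    #include <cstdint>
    static int32_t Sbfx(uint32_t x, unsigned lsb, unsigned width) {
        const unsigned left = 32 - width - lsb;  // discard bits above the field
        const unsigned right = 32 - width;       // shift back, replicating the field's sign bit
        return int32_t(x << left) >> right;
    }
    // Sbfx(0x00000F50, 4, 8) == int32_t(0xFFFFFFF5): the 8-bit field 0xF5 sign-extended.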
+
+bool TranslatorVisitor::thumb32_SSAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm) {
+ return Saturation(*this, sh, n, d, concatenate(imm3, imm2), sat_imm.ZeroExtend() + 1, &IREmitter::SignedSaturation);
+}
+
+bool TranslatorVisitor::thumb32_SSAT16(Reg n, Reg d, Imm<4> sat_imm) {
+ return Saturation16(*this, n, d, sat_imm.ZeroExtend() + 1, &IREmitter::SignedSaturation);
+}
+
+bool TranslatorVisitor::thumb32_SUB_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 imm = concatenate(imm1, imm3, imm8).ZeroExtend();
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.SubWithCarry(reg_n, ir.Imm32(imm), ir.Imm1(1));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1) {
+ if (d == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 lsbit = concatenate(imm3, imm2).ZeroExtend();
+ const u32 widthm1_value = widthm1.ZeroExtend();
+ const u32 msb = lsbit + widthm1_value;
+ if (msb >= mcl::bitsizeof<u32>) {
+ return UnpredictableInstruction();
+ }
+
+ const auto operand = ir.GetRegister(n);
+ const auto mask = ir.Imm32(mcl::bit::ones<u32>(widthm1_value + 1));
+ const auto result = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(u8(lsbit))), mask);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_USAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm) {
+ return Saturation(*this, sh, n, d, concatenate(imm3, imm2), sat_imm.ZeroExtend(), &IREmitter::UnsignedSaturation);
+}
+
+bool TranslatorVisitor::thumb32_USAT16(Reg n, Reg d, Imm<4> sat_imm) {
+ return Saturation16(*this, n, d, sat_imm.ZeroExtend(), &IREmitter::UnsignedSaturation);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp
new file mode 100644
index 0000000000..1dda532d92
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp
@@ -0,0 +1,206 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+IR::U32 Rotate(A32::IREmitter& ir, Reg m, SignExtendRotation rotate) {
+ const u8 rotate_by = static_cast<u8>(static_cast<size_t>(rotate) * 8);
+ return ir.RotateRight(ir.GetRegister(m), ir.Imm8(rotate_by), ir.Imm1(0)).result;
+}
+
+using ShiftFunction = IR::ResultAndCarry<IR::U32> (IREmitter::*)(const IR::U32&, const IR::U8&, const IR::U1&);
+
+bool ShiftInstruction(TranslatorVisitor& v, Reg m, Reg d, Reg s, bool S, ShiftFunction shift_fn) {
+ if (d == Reg::PC || m == Reg::PC || s == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto shift_s = v.ir.LeastSignificantByte(v.ir.GetRegister(s));
+ const auto apsr_c = v.ir.GetCFlag();
+ const auto result_carry = (v.ir.*shift_fn)(v.ir.GetRegister(m), shift_s, apsr_c);
+
+ if (S) {
+ v.ir.SetCpsrNZC(v.ir.NZFrom(result_carry.result), result_carry.carry);
+ }
+
+ v.ir.SetRegister(d, result_carry.result);
+ return true;
+}
+} // Anonymous namespace
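ShiftInstruction takes a pointer-to-member-function so that ASR/LSL/LSR/ROR can share one body; `(v.ir.*shift_fn)(...)` is the corresponding call syntax. A minimal standalone illustration of the pattern (all names below are made up):

    struct Emitter {
        int Lsl(int x, int amount) { return x << amount; }
        int Lsr(int x, int amount) { return int(unsigned(x) >> amount); }
    };
    using ShiftFn = int (Emitter::*)(int, int);
    static int Apply(Emitter& e, ShiftFn fn, int x, int amount) {
        return (e.*fn)(x, amount);  // same shape as (v.ir.*shift_fn)(operand, shift, carry)
    }
    // Emitter e; Apply(e, &Emitter::Lsl, 1, 4) == 16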
+
+bool TranslatorVisitor::thumb32_ASR_reg(bool S, Reg m, Reg d, Reg s) {
+ return ShiftInstruction(*this, m, d, s, S, &IREmitter::ArithmeticShiftRight);
+}
+
+bool TranslatorVisitor::thumb32_LSL_reg(bool S, Reg m, Reg d, Reg s) {
+ return ShiftInstruction(*this, m, d, s, S, &IREmitter::LogicalShiftLeft);
+}
+
+bool TranslatorVisitor::thumb32_LSR_reg(bool S, Reg m, Reg d, Reg s) {
+ return ShiftInstruction(*this, m, d, s, S, &IREmitter::LogicalShiftRight);
+}
+
+bool TranslatorVisitor::thumb32_ROR_reg(bool S, Reg m, Reg d, Reg s) {
+ return ShiftInstruction(*this, m, d, s, S, &IREmitter::RotateRight);
+}
+
+bool TranslatorVisitor::thumb32_SXTB(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SXTB16(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080));
+ const auto result = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
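The multiply by 0x1FE above is a branch-free way to replicate each lane's sign bit (bit 7 and bit 23) across the top byte of its 16-bit lane. A scalar check of the same arithmetic, illustrative only:

    #include <cstdint>
    static uint32_t Sxtb16(uint32_t rotated) {
        const uint32_t low_byte = rotated & 0x00FF00FFu;
        const uint32_t sign_bit = rotated & 0x00800080u;
        return low_byte | (sign_bit * 0x1FEu);  // 0x80 * 0x1FE == 0xFF00 within each lane
    }
    // Sxtb16(0x0081007F) == 0xFF81007F: 0x81 -> 0xFF81, 0x7F -> 0x007F.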
+
+bool TranslatorVisitor::thumb32_SXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080));
+ const auto addend = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE)));
+ const auto result = ir.PackedAddU16(addend, ir.GetRegister(n)).result;
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SXTH(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTB(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTB16(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.And(rotated, ir.Imm32(0x00FF00FF));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ auto result = ir.And(rotated, ir.Imm32(0x00FF00FF));
+ const auto reg_n = ir.GetRegister(n);
+ result = ir.PackedAddU16(reg_n, result).result;
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTH(Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto result = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto rotated = Rotate(ir, m, rotate);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(reg_n, ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated)));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp
new file mode 100644
index 0000000000..3058350043
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp
@@ -0,0 +1,253 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_TST_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) {
+ if (n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_AND_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+
+ if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.And(ir.GetRegister(n), shifted.result);
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_BIC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MOV_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = shifted.result;
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ORR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(n != Reg::PC, "Decode error");
+
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.Or(ir.GetRegister(n), shifted.result);
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MVN_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.Not(shifted.result);
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ORN_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(n != Reg::PC, "Decode error");
+
+ if (d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.Or(ir.GetRegister(n), ir.Not(shifted.result));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_TEQ_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) {
+ if (n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_EOR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+
+ if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.Eor(ir.GetRegister(n), shifted.result);
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_PKH(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<1> tb, Reg m) {
+ const ShiftType type = concatenate(tb, Imm<1>{0}).ZeroExtend<ShiftType>();
+
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
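+    // tb == 0 encodes PKHBT (LSL shift, bottom half taken from Rn); tb == 1 encodes PKHTB (ASR shift, top half taken from Rn).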
+ const auto operand2 = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()).result;
+ const auto lower = ir.And((tb == 1) ? operand2 : ir.GetRegister(n), ir.Imm32(0x0000FFFF));
+ const auto upper = ir.And((tb == 1) ? ir.GetRegister(n) : operand2, ir.Imm32(0xFFFF0000));
+ const auto result = ir.Or(upper, lower);
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_CMN_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) {
+ if (n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADD_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+
+ if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_ADC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SBC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag());
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_CMP_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) {
+ if (n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
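+    // Subtraction is evaluated as Rn + NOT(operand2) + carry, so a carry-in of 1 produces Rn - operand2.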
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SUB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ ASSERT_MSG(!(d == Reg::PC && S), "Decode error");
+
+ if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_RSB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
+ const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1));
+ ir.SetRegister(d, result);
+ if (S) {
+ ir.SetCpsrNZCV(ir.NZCVFrom(result));
+ }
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp
new file mode 100644
index 0000000000..42b2ebf4aa
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp
@@ -0,0 +1,198 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+static bool PLDHandler(TranslatorVisitor& v, bool W) {
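+    // Preload hints behave as NOPs unless the user has opted in to hint hooking.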
+ if (!v.options.hook_hint_instructions) {
+ return true;
+ }
+
+ const auto exception = W ? Exception::PreloadDataWithIntentToWrite
+ : Exception::PreloadData;
+ return v.RaiseException(exception);
+}
+
+static bool PLIHandler(TranslatorVisitor& v) {
+ if (!v.options.hook_hint_instructions) {
+ return true;
+ }
+
+ return v.RaiseException(Exception::PreloadInstruction);
+}
+
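+// Member-function pointer selecting between zero extension (LDRB) and sign extension (LDRSB) so the load helpers below can be shared.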
+using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::U8&);
+
+static bool LoadByteLiteral(TranslatorVisitor& v, bool U, Reg t, Imm<12> imm12, ExtensionFunction ext_fn) {
+ const u32 imm32 = imm12.ZeroExtend();
+ const u32 base = v.ir.AlignPC(4);
+ const u32 address = U ? (base + imm32) : (base - imm32);
+ const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory8(v.ir.Imm32(address), IR::AccType::NORMAL));
+
+ v.ir.SetRegister(t, data);
+ return true;
+}
+
+static bool LoadByteRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, ExtensionFunction ext_fn) {
+ if (m == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto reg_n = v.ir.GetRegister(n);
+ const auto reg_m = v.ir.GetRegister(m);
+ const auto offset = v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(imm2.ZeroExtend<u8>()));
+ const auto address = v.ir.Add(reg_n, offset);
+ const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ v.ir.SetRegister(t, data);
+ return true;
+}
+
+static bool LoadByteImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, ExtensionFunction ext_fn) {
+ const u32 imm32 = imm12.ZeroExtend();
+ const IR::U32 reg_n = v.ir.GetRegister(n);
+ const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm32))
+ : v.ir.Sub(reg_n, v.ir.Imm32(imm32));
+ const IR::U32 address = P ? offset_address : reg_n;
+ const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory8(address, IR::AccType::NORMAL));
+
+ v.ir.SetRegister(t, data);
+ if (W) {
+ v.ir.SetRegister(n, offset_address);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_PLD_lit(bool /*U*/, Imm<12> /*imm12*/) {
+ return PLDHandler(*this, false);
+}
+
+bool TranslatorVisitor::thumb32_PLD_imm8(bool W, Reg /*n*/, Imm<8> /*imm8*/) {
+ return PLDHandler(*this, W);
+}
+
+bool TranslatorVisitor::thumb32_PLD_imm12(bool W, Reg /*n*/, Imm<12> /*imm12*/) {
+ return PLDHandler(*this, W);
+}
+
+bool TranslatorVisitor::thumb32_PLD_reg(bool W, Reg /*n*/, Imm<2> /*imm2*/, Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ return PLDHandler(*this, W);
+}
+
+bool TranslatorVisitor::thumb32_PLI_lit(bool /*U*/, Imm<12> /*imm12*/) {
+ return PLIHandler(*this);
+}
+
+bool TranslatorVisitor::thumb32_PLI_imm8(Reg /*n*/, Imm<8> /*imm8*/) {
+ return PLIHandler(*this);
+}
+
+bool TranslatorVisitor::thumb32_PLI_imm12(Reg /*n*/, Imm<12> /*imm12*/) {
+ return PLIHandler(*this);
+}
+
+bool TranslatorVisitor::thumb32_PLI_reg(Reg /*n*/, Imm<2> /*imm2*/, Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ return PLIHandler(*this);
+}
+
+bool TranslatorVisitor::thumb32_LDRB_lit(bool U, Reg t, Imm<12> imm12) {
+ return LoadByteLiteral(*this, U, t, imm12,
+ &IREmitter::ZeroExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) {
+ if (t == Reg::PC && W) {
+ return UnpredictableInstruction();
+ }
+ if (W && n == t) {
+ return UnpredictableInstruction();
+ }
+ if (!P && !W) {
+ return UndefinedInstruction();
+ }
+
+ return LoadByteImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()},
+ &IREmitter::ZeroExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRB_imm12(Reg n, Reg t, Imm<12> imm12) {
+ return LoadByteImmediate(*this, n, t, true, true, false, imm12,
+ &IREmitter::ZeroExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRB_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return LoadByteRegister(*this, n, t, imm2, m,
+ &IREmitter::ZeroExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRBT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: Add an unpredictable instruction path for the case where this
+    //       is executed in hypervisor mode, should we ever support
+    //       privileged execution modes.
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat it as a normal LDRB, given we don't support
+ // execution levels other than EL0 currently.
+ return thumb32_LDRB_imm8(n, t, true, true, false, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDRSB_lit(bool U, Reg t, Imm<12> imm12) {
+ return LoadByteLiteral(*this, U, t, imm12,
+ &IREmitter::SignExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) {
+ if (t == Reg::PC && W) {
+ return UnpredictableInstruction();
+ }
+ if (W && n == t) {
+ return UnpredictableInstruction();
+ }
+ if (!P && !W) {
+ return UndefinedInstruction();
+ }
+
+ return LoadByteImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()},
+ &IREmitter::SignExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSB_imm12(Reg n, Reg t, Imm<12> imm12) {
+ return LoadByteImmediate(*this, n, t, true, true, false, imm12,
+ &IREmitter::SignExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSB_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return LoadByteRegister(*this, n, t, imm2, m,
+ &IREmitter::SignExtendByteToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSBT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: Add an unpredictable instruction path for the case where this
+    //       is executed in hypervisor mode, should we ever support
+    //       privileged execution modes.
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat it as a normal LDRSB, given we don't support
+ // execution levels other than EL0 currently.
+ return thumb32_LDRSB_imm8(n, t, true, true, false, imm8);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp
new file mode 100644
index 0000000000..5b9f1639af
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp
@@ -0,0 +1,138 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
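+// Member-function pointer selecting between zero extension (LDRH) and sign extension (LDRSH) so the load helpers below can be shared.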
+using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::U16&);
+
+static bool LoadHalfLiteral(TranslatorVisitor& v, bool U, Reg t, Imm<12> imm12, ExtensionFunction ext_fn) {
+ const auto imm32 = imm12.ZeroExtend();
+ const auto base = v.ir.AlignPC(4);
+ const auto address = U ? (base + imm32) : (base - imm32);
+ const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory16(v.ir.Imm32(address), IR::AccType::NORMAL));
+
+ v.ir.SetRegister(t, data);
+ return true;
+}
+
+static bool LoadHalfRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, ExtensionFunction ext_fn) {
+ if (m == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const IR::U32 reg_m = v.ir.GetRegister(m);
+ const IR::U32 reg_n = v.ir.GetRegister(n);
+ const IR::U32 offset = v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(imm2.ZeroExtend<u8>()));
+ const IR::U32 address = v.ir.Add(reg_n, offset);
+ const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ v.ir.SetRegister(t, data);
+ return true;
+}
+
+static bool LoadHalfImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, ExtensionFunction ext_fn) {
+ const u32 imm32 = imm12.ZeroExtend();
+ const IR::U32 reg_n = v.ir.GetRegister(n);
+ const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm32))
+ : v.ir.Sub(reg_n, v.ir.Imm32(imm32));
+ const IR::U32 address = P ? offset_address
+ : reg_n;
+ const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory16(address, IR::AccType::NORMAL));
+
+ if (W) {
+ v.ir.SetRegister(n, offset_address);
+ }
+
+ v.ir.SetRegister(t, data);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDRH_lit(bool U, Reg t, Imm<12> imm12) {
+ return LoadHalfLiteral(*this, U, t, imm12, &IREmitter::ZeroExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRH_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return LoadHalfRegister(*this, n, t, imm2, m, &IREmitter::ZeroExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) {
+ if (!P && !W) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC && W) {
+ return UnpredictableInstruction();
+ }
+ if (W && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ return LoadHalfImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()},
+ &IREmitter::ZeroExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRH_imm12(Reg n, Reg t, Imm<12> imm12) {
+ return LoadHalfImmediate(*this, n, t, true, true, false, imm12,
+ &IREmitter::ZeroExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRHT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: Add an unpredictable instruction path for the case where this
+    //       is executed in hypervisor mode, should we ever support
+    //       privileged execution levels.
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat it as a normal LDRH, given we don't support
+ // execution levels other than EL0 currently.
+ return thumb32_LDRH_imm8(n, t, true, true, false, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDRSH_lit(bool U, Reg t, Imm<12> imm12) {
+ return LoadHalfLiteral(*this, U, t, imm12, &IREmitter::SignExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSH_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return LoadHalfRegister(*this, n, t, imm2, m, &IREmitter::SignExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) {
+ if (!P && !W) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC && W) {
+ return UnpredictableInstruction();
+ }
+ if (W && n == t) {
+ return UnpredictableInstruction();
+ }
+
+ return LoadHalfImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()},
+ &IREmitter::SignExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSH_imm12(Reg n, Reg t, Imm<12> imm12) {
+ return LoadHalfImmediate(*this, n, t, true, true, false, imm12,
+ &IREmitter::SignExtendHalfToWord);
+}
+
+bool TranslatorVisitor::thumb32_LDRSHT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: Add an unpredictable instruction path for the case where this
+    //       is executed in hypervisor mode, should we ever support
+    //       privileged execution levels.
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat it as a normal LDRSH, given we don't support
+ // execution levels other than EL0 currently.
+ return thumb32_LDRSH_imm8(n, t, true, true, false, imm8);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp
new file mode 100644
index 0000000000..17d4285c23
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp
@@ -0,0 +1,292 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+static bool ITBlockCheck(const A32::IREmitter& ir) {
+ return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock();
+}
+
+static bool TableBranch(TranslatorVisitor& v, Reg n, Reg m, bool half) {
+ if (m == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+ if (ITBlockCheck(v.ir)) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto reg_m = v.ir.GetRegister(m);
+ const auto reg_n = v.ir.GetRegister(n);
+
+ IR::U32 halfwords;
+ if (half) {
+ const auto data = v.ir.ReadMemory16(v.ir.Add(reg_n, v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(1))), IR::AccType::NORMAL);
+ halfwords = v.ir.ZeroExtendToWord(data);
+ } else {
+ halfwords = v.ir.ZeroExtendToWord(v.ir.ReadMemory8(v.ir.Add(reg_n, reg_m), IR::AccType::NORMAL));
+ }
+
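+    // The branch target is the current PC plus twice the table entry; entries are halfword offsets.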
+ const auto current_pc = v.ir.Imm32(v.ir.PC());
+ const auto branch_value = v.ir.Add(current_pc, v.ir.Add(halfwords, halfwords));
+
+ v.ir.UpdateUpperLocationDescriptor();
+ v.ir.BranchWritePC(branch_value);
+ v.ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+static bool LoadDualImmediate(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ if (W && (n == t || n == t2)) {
+ return v.UnpredictableInstruction();
+ }
+ if (t == Reg::PC || t2 == Reg::PC || t == t2) {
+ return v.UnpredictableInstruction();
+ }
+
+ const u32 imm = imm8.ZeroExtend() << 2;
+ const IR::U32 reg_n = v.ir.GetRegister(n);
+ const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm))
+ : v.ir.Sub(reg_n, v.ir.Imm32(imm));
+ const IR::U32 address = P ? offset_address : reg_n;
+
+    // NOTE: If the address is only word-aligned (exactly off by 4 from doubleword alignment), each word is still an atomic access.
+ const IR::U64 data = v.ir.ReadMemory64(address, IR::AccType::ATOMIC);
+
+ if (v.ir.current_location.EFlag()) {
+ v.ir.SetRegister(t, v.ir.MostSignificantWord(data).result);
+ v.ir.SetRegister(t2, v.ir.LeastSignificantWord(data));
+ } else {
+ v.ir.SetRegister(t, v.ir.LeastSignificantWord(data));
+ v.ir.SetRegister(t2, v.ir.MostSignificantWord(data).result);
+ }
+
+ if (W) {
+ v.ir.SetRegister(n, offset_address);
+ }
+ return true;
+}
+
+static bool LoadDualLiteral(TranslatorVisitor& v, bool U, bool W, Reg t, Reg t2, Imm<8> imm8) {
+ if (t == Reg::PC || t2 == Reg::PC || t == t2) {
+ return v.UnpredictableInstruction();
+ }
+ if (W) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto imm = imm8.ZeroExtend() << 2;
+ const auto address = U ? v.ir.Add(v.ir.Imm32(v.ir.AlignPC(4)), v.ir.Imm32(imm))
+ : v.ir.Sub(v.ir.Imm32(v.ir.AlignPC(4)), v.ir.Imm32(imm));
+
+    // NOTE: If the address is only word-aligned (exactly off by 4 from doubleword alignment), each word is still an atomic access.
+ const IR::U64 data = v.ir.ReadMemory64(address, IR::AccType::ATOMIC);
+
+ if (v.ir.current_location.EFlag()) {
+ v.ir.SetRegister(t, v.ir.MostSignificantWord(data).result);
+ v.ir.SetRegister(t2, v.ir.LeastSignificantWord(data));
+ } else {
+ v.ir.SetRegister(t, v.ir.LeastSignificantWord(data));
+ v.ir.SetRegister(t2, v.ir.MostSignificantWord(data).result);
+ }
+
+ return true;
+}
+
+static bool StoreDual(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ if (W && (n == t || n == t2)) {
+ return v.UnpredictableInstruction();
+ }
+ if (n == Reg::PC || t == Reg::PC || t2 == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const u32 imm = imm8.ZeroExtend() << 2;
+ const IR::U32 reg_n = v.ir.GetRegister(n);
+ const IR::U32 reg_t = v.ir.GetRegister(t);
+ const IR::U32 reg_t2 = v.ir.GetRegister(t2);
+
+ const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm))
+ : v.ir.Sub(reg_n, v.ir.Imm32(imm));
+ const IR::U32 address = P ? offset_address : reg_n;
+
+ const IR::U64 data = v.ir.current_location.EFlag() ? v.ir.Pack2x32To1x64(reg_t2, reg_t)
+ : v.ir.Pack2x32To1x64(reg_t, reg_t2);
+
+    // NOTE: If the address is only word-aligned (exactly off by 4 from doubleword alignment), each word is still an atomic access.
+ v.ir.WriteMemory64(address, data, IR::AccType::ATOMIC);
+
+ if (W) {
+ v.ir.SetRegister(n, offset_address);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDA(Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.SetRegister(t, ir.ReadMemory32(address, IR::AccType::ORDERED));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ return LoadDualImmediate(*this, true, U, W, n, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8) {
+ return LoadDualLiteral(*this, U, true, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8) {
+ return LoadDualLiteral(*this, U, W, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ return StoreDual(*this, false, U, true, n, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+ return StoreDual(*this, true, U, W, n, t, t2, imm8);
+}
+
+bool TranslatorVisitor::thumb32_LDREX(Reg n, Reg t, Imm<8> imm8) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm8.ZeroExtend() << 2));
+ const auto value = ir.ExclusiveReadMemory32(address, IR::AccType::ATOMIC);
+
+ ir.SetRegister(t, value);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDREXB(Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.ZeroExtendToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ATOMIC));
+
+ ir.SetRegister(t, value);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDREXD(Reg n, Reg t, Reg t2) {
+ if (t == Reg::PC || t2 == Reg::PC || t == t2 || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ATOMIC);
+
+ // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR
+ ir.SetRegister(t, lo);
+ ir.SetRegister(t2, hi);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDREXH(Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.ZeroExtendToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ATOMIC));
+
+ ir.SetRegister(t, value);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STL(Reg n, Reg t) {
+ if (t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::ORDERED);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) {
+ if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm8.ZeroExtend() << 2));
+ const auto value = ir.GetRegister(t);
+ const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STREXB(Reg n, Reg t, Reg d) {
+ if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantByte(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d) {
+ if (d == Reg::PC || t == Reg::PC || t2 == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (d == n || d == t || d == t2) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value_lo = ir.GetRegister(t);
+ const auto value_hi = ir.GetRegister(t2);
+ const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STREXH(Reg n, Reg t, Reg d) {
+ if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (d == n || d == t) {
+ return UnpredictableInstruction();
+ }
+
+ const auto address = ir.GetRegister(n);
+ const auto value = ir.LeastSignificantHalf(ir.GetRegister(t));
+ const auto passed = ir.ExclusiveWriteMemory16(address, value, IR::AccType::ATOMIC);
+ ir.SetRegister(d, passed);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_TBB(Reg n, Reg m) {
+ return TableBranch(*this, n, m, false);
+}
+
+bool TranslatorVisitor::thumb32_TBH(Reg n, Reg m) {
+ return TableBranch(*this, n, m, true);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp
new file mode 100644
index 0000000000..2bc782b973
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp
@@ -0,0 +1,149 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+static bool ITBlockCheck(const A32::IREmitter& ir) {
+ return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock();
+}
+
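+// Loads the listed registers in ascending order from consecutive words starting at start_address.
+// Writeback only occurs when the base register is not in the list; a PC load (bit 15) terminates the block.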
+static bool LDMHelper(A32::IREmitter& ir, bool W, Reg n, u32 list, const IR::U32& start_address, const IR::U32& writeback_address) {
+ auto address = start_address;
+ for (size_t i = 0; i <= 14; i++) {
+ if (mcl::bit::get_bit(i, list)) {
+ ir.SetRegister(static_cast<Reg>(i), ir.ReadMemory32(address, IR::AccType::ATOMIC));
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+ if (W && !mcl::bit::get_bit(RegNumber(n), list)) {
+ ir.SetRegister(n, writeback_address);
+ }
+ if (mcl::bit::get_bit<15>(list)) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(ir.ReadMemory32(address, IR::AccType::ATOMIC));
+ if (n == Reg::R13) {
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ } else {
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ }
+ return false;
+ }
+ return true;
+}
+
+static bool STMHelper(A32::IREmitter& ir, bool W, Reg n, u32 list, const IR::U32& start_address, const IR::U32& writeback_address) {
+ auto address = start_address;
+ for (size_t i = 0; i <= 14; i++) {
+ if (mcl::bit::get_bit(i, list)) {
+ ir.WriteMemory32(address, ir.GetRegister(static_cast<Reg>(i)), IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+ if (W) {
+ ir.SetRegister(n, writeback_address);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDMDB(bool W, Reg n, Imm<16> reg_list) {
+ const auto regs_imm = reg_list.ZeroExtend();
+ const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm));
+
+ if (n == Reg::PC || num_regs < 2) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<15>() && reg_list.Bit<14>()) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<13>()) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<15>() && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ // Start address is the same as the writeback address.
+ const IR::U32 start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(4 * num_regs));
+ return LDMHelper(ir, W, n, regs_imm, start_address, start_address);
+}
+
+bool TranslatorVisitor::thumb32_LDMIA(bool W, Reg n, Imm<16> reg_list) {
+ const auto regs_imm = reg_list.ZeroExtend();
+ const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm));
+
+ if (n == Reg::PC || num_regs < 2) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<15>() && reg_list.Bit<14>()) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<13>()) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<15>() && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ const auto start_address = ir.GetRegister(n);
+ const auto writeback_address = ir.Add(start_address, ir.Imm32(num_regs * 4));
+ return LDMHelper(ir, W, n, regs_imm, start_address, writeback_address);
+}
+
+bool TranslatorVisitor::thumb32_POP(Imm<16> reg_list) {
+ return thumb32_LDMIA(true, Reg::SP, reg_list);
+}
+
+bool TranslatorVisitor::thumb32_PUSH(Imm<15> reg_list) {
+ return thumb32_STMDB(true, Reg::SP, reg_list);
+}
+
+bool TranslatorVisitor::thumb32_STMIA(bool W, Reg n, Imm<15> reg_list) {
+ const auto regs_imm = reg_list.ZeroExtend();
+ const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm));
+
+ if (n == Reg::PC || num_regs < 2) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<13>()) {
+ return UnpredictableInstruction();
+ }
+
+ const auto start_address = ir.GetRegister(n);
+ const auto writeback_address = ir.Add(start_address, ir.Imm32(num_regs * 4));
+ return STMHelper(ir, W, n, regs_imm, start_address, writeback_address);
+}
+
+bool TranslatorVisitor::thumb32_STMDB(bool W, Reg n, Imm<15> reg_list) {
+ const auto regs_imm = reg_list.ZeroExtend();
+ const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm));
+
+ if (n == Reg::PC || num_regs < 2) {
+ return UnpredictableInstruction();
+ }
+ if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) {
+ return UnpredictableInstruction();
+ }
+ if (reg_list.Bit<13>()) {
+ return UnpredictableInstruction();
+ }
+
+ // Start address is the same as the writeback address.
+ const IR::U32 start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(4 * num_regs));
+ return STMHelper(ir, W, n, regs_imm, start_address, start_address);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp
new file mode 100644
index 0000000000..b92e27fc66
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp
@@ -0,0 +1,134 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+static bool ITBlockCheck(const A32::IREmitter& ir) {
+ return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock();
+}
+
+bool TranslatorVisitor::thumb32_LDR_lit(bool U, Reg t, Imm<12> imm12) {
+ if (t == Reg::PC && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 imm32 = imm12.ZeroExtend();
+ const u32 base = ir.AlignPC(4);
+ const u32 address = U ? base + imm32 : base - imm32;
+ const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(data);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDR_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) {
+ if (!P && !W) {
+ return UndefinedInstruction();
+ }
+ if (W && n == t) {
+ return UnpredictableInstruction();
+ }
+ if (t == Reg::PC && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ const u32 imm32 = imm8.ZeroExtend();
+ const IR::U32 reg_n = ir.GetRegister(n);
+ const IR::U32 offset_address = U ? ir.Add(reg_n, ir.Imm32(imm32))
+ : ir.Sub(reg_n, ir.Imm32(imm32));
+ const IR::U32 address = P ? offset_address
+ : reg_n;
+ const IR::U32 data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ if (W) {
+ ir.SetRegister(n, offset_address);
+ }
+
+ if (t == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(data);
+
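+        // A post-indexed load into PC with SP writeback matches the `POP {..., pc}` return idiom, hence the return stack buffer hint.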
+ if (!P && W && n == Reg::R13) {
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ } else {
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ }
+
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDR_imm12(Reg n, Reg t, Imm<12> imm12) {
+ if (t == Reg::PC && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ const auto imm32 = imm12.ZeroExtend();
+ const auto reg_n = ir.GetRegister(n);
+ const auto address = ir.Add(reg_n, ir.Imm32(imm32));
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(data);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDR_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ if (m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ if (t == Reg::PC && ITBlockCheck(ir)) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto offset = ir.LogicalShiftLeft(reg_m, ir.Imm8(imm2.ZeroExtend<u8>()));
+ const auto address = ir.Add(reg_n, offset);
+ const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL);
+
+ if (t == Reg::PC) {
+ ir.UpdateUpperLocationDescriptor();
+ ir.LoadWritePC(data);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ }
+
+ ir.SetRegister(t, data);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_LDRT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: Add an unpredictable instruction path for the case where this
+    //       is executed in hypervisor mode, should we ever support
+    //       privileged execution levels.
+
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat it as a normal LDR, given we don't support
+ // execution levels other than EL0 currently.
+ return thumb32_LDR_imm8(n, t, true, true, false, imm8);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp
new file mode 100644
index 0000000000..0a0cdd9f2e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp
@@ -0,0 +1,222 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+namespace {
+using DivideFunction = IR::U32U64 (IREmitter::*)(const IR::U32U64&, const IR::U32U64&);
+
+bool DivideOperation(TranslatorVisitor& v, Reg d, Reg m, Reg n, DivideFunction fn) {
+ if (d == Reg::PC || m == Reg::PC || n == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const IR::U32 operand1 = v.ir.GetRegister(n);
+ const IR::U32 operand2 = v.ir.GetRegister(m);
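+    // Note: A-profile SDIV and UDIV yield 0 when the divisor is zero; no exception is raised.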
+ const IR::U32 result = (v.ir.*fn)(operand1, operand2);
+
+ v.ir.SetRegister(d, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::thumb32_SDIV(Reg n, Reg d, Reg m) {
+ return DivideOperation(*this, d, m, n, &IREmitter::SignedDiv);
+}
+
+bool TranslatorVisitor::thumb32_SMLAL(Reg n, Reg dLo, Reg dHi, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto product = ir.Mul(n64, m64);
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(product, addend);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLALD(Reg n, Reg dLo, Reg dHi, bool M, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo));
+ const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(ir.Add(product_lo, product_hi), addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLALXY(Reg n, Reg dLo, Reg dHi, bool N, bool M, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const IR::U64 product = ir.SignExtendWordToLong(ir.Mul(n16, m16));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(product, addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLSLD(Reg n, Reg dLo, Reg dHi, bool M, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo));
+ const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi));
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(ir.Sub(product_lo, product_hi), addend);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMULL(Reg n, Reg dLo, Reg dHi, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Mul(n64, m64);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UDIV(Reg n, Reg d, Reg m) {
+ return DivideOperation(*this, d, m, n, &IREmitter::UnsignedDiv);
+}
+
+bool TranslatorVisitor::thumb32_UMLAL(Reg n, Reg dLo, Reg dHi, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto product = ir.Mul(n64, m64);
+ const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi));
+ const auto result = ir.Add(product, addend);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UMULL(Reg n, Reg dLo, Reg dHi, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Mul(n64, m64);
+ const auto lo = ir.LeastSignificantWord(result);
+ const auto hi = ir.MostSignificantWord(result).result;
+
+ ir.SetRegister(dLo, lo);
+ ir.SetRegister(dHi, hi);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UMAAL(Reg n, Reg dLo, Reg dHi, Reg m) {
+ if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (dHi == dLo) {
+ return UnpredictableInstruction();
+ }
+
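+    // UMAAL computes Rn*Rm + RdHi + RdLo; the result always fits in 64 bits, so no overflow handling is needed.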
+ const auto lo64 = ir.ZeroExtendWordToLong(ir.GetRegister(dLo));
+ const auto hi64 = ir.ZeroExtendWordToLong(ir.GetRegister(dHi));
+ const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m));
+ const auto result = ir.Add(ir.Add(ir.Mul(n64, m64), hi64), lo64);
+
+ ir.SetRegister(dLo, ir.LeastSignificantWord(result));
+ ir.SetRegister(dHi, ir.MostSignificantWord(result).result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp
new file mode 100644
index 0000000000..9edb2310b9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp
@@ -0,0 +1,157 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_CLZ(Reg n, Reg d, Reg m) {
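+    // The T32 encoding carries Rm in two register fields; the instruction is unpredictable if they differ.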
+ if (m != n || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto result = ir.CountLeadingZeros(reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QADD(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.SignedSaturatedAddWithFlag(reg_m, reg_n);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QDADD(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto doubled_n = ir.SignedSaturatedAddWithFlag(reg_n, reg_n);
+ ir.OrQFlag(doubled_n.overflow);
+
+ const auto result = ir.SignedSaturatedAddWithFlag(reg_m, doubled_n.result);
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QDSUB(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto doubled_n = ir.SignedSaturatedAddWithFlag(reg_n, reg_n);
+ ir.OrQFlag(doubled_n.overflow);
+
+ const auto result = ir.SignedSaturatedSubWithFlag(reg_m, doubled_n.result);
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QSUB(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.SignedSaturatedSubWithFlag(reg_m, reg_n);
+
+ ir.SetRegister(d, result.result);
+ ir.OrQFlag(result.overflow);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_RBIT(Reg n, Reg d, Reg m) {
+ if (m != n || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 swapped = ir.ByteReverseWord(ir.GetRegister(m));
+
+ // ((x & 0xF0F0F0F0) >> 4) | ((x & 0x0F0F0F0F) << 4)
+ const IR::U32 first_lsr = ir.LogicalShiftRight(ir.And(swapped, ir.Imm32(0xF0F0F0F0)), ir.Imm8(4));
+ const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(swapped, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4));
+ const IR::U32 corrected = ir.Or(first_lsl, first_lsr);
+
+ // ((x & 0x88888888) >> 3) | ((x & 0x44444444) >> 1) |
+ // ((x & 0x22222222) << 1) | ((x & 0x11111111) << 3)
+ const IR::U32 second_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x88888888)), ir.Imm8(3));
+ const IR::U32 third_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x44444444)), ir.Imm8(1));
+ const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x22222222)), ir.Imm8(1));
+ const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x11111111)), ir.Imm8(3));
+
+ const IR::U32 result = ir.Or(ir.Or(ir.Or(second_lsr, third_lsr), second_lsl), third_lsl);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_REV(Reg n, Reg d, Reg m) {
+ if (m != n || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto result = ir.ByteReverseWord(ir.GetRegister(m));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_REV16(Reg n, Reg d, Reg m) {
+ if (m != n || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto lo = ir.And(ir.LogicalShiftRight(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0x00FF00FF));
+ const auto hi = ir.And(ir.LogicalShiftLeft(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0xFF00FF00));
+ const auto result = ir.Or(lo, hi);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_REVSH(Reg n, Reg d, Reg m) {
+ if (m != n || d == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(reg_m));
+
+ ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SEL(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSelect(ir.GetGEFlags(), reg_m, reg_n);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp
new file mode 100644
index 0000000000..587cbaafdb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp
@@ -0,0 +1,312 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+bool TranslatorVisitor::thumb32_MLA(Reg n, Reg a, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_a = ir.GetRegister(a);
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Add(ir.Mul(reg_n, reg_m), reg_a);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MLS(Reg n, Reg a, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_a = ir.GetRegister(a);
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Sub(reg_a, ir.Mul(reg_n, reg_m));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_MUL(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.Mul(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLAD(Reg n, Reg a, Reg d, bool X, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (X) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const IR::U32 addend = ir.GetRegister(a);
+
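+    // The Q flag is accumulated from both additions: product_lo + product_hi, then the running sum + Ra.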
+ auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0));
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ result = ir.AddWithCarry(result, addend, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLSD(Reg n, Reg a, Reg d, bool X, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (X) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const IR::U32 addend = ir.GetRegister(a);
+ const IR::U32 product = ir.Sub(product_lo, product_hi);
+ auto result = ir.AddWithCarry(product, addend, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMLAXY(Reg n, Reg a, Reg d, bool N, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const IR::U32 product = ir.Mul(n16, m16);
+ const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMMLA(Reg n, Reg a, Reg d, bool R, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a));
+ const auto temp = ir.Add(a64, ir.Mul(n64, m64));
+ const auto result_carry = ir.MostSignificantWord(temp);
+ auto result = result_carry.result;
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMMLS(Reg n, Reg a, Reg d, bool R, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a));
+ const auto temp = ir.Sub(a64, ir.Mul(n64, m64));
+ const auto result_carry = ir.MostSignificantWord(temp);
+ auto result = result_carry.result;
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMMUL(Reg n, Reg d, bool R, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m));
+ const auto product = ir.Mul(n64, m64);
+ const auto result_carry = ir.MostSignificantWord(product);
+ auto result = result_carry.result;
+ if (R) {
+ result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry);
+ }
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMUAD(Reg n, Reg d, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMUSD(Reg n, Reg d, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U32 n32 = ir.GetRegister(n);
+ const IR::U32 m32 = ir.GetRegister(m);
+ const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result;
+
+ IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ if (M) {
+ std::swap(m_lo, m_hi);
+ }
+
+ const IR::U32 product_lo = ir.Mul(n_lo, m_lo);
+ const IR::U32 product_hi = ir.Mul(n_hi, m_hi);
+ const IR::U32 result = ir.Sub(product_lo, product_hi);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMULXY(Reg n, Reg d, bool N, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto n32 = ir.GetRegister(n);
+ const auto m32 = ir.GetRegister(m);
+ const auto n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32));
+ const auto m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result
+ : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32));
+ const auto result = ir.Mul(n16, m16);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
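+// SMLAW{B,T}: multiplies Rn by the selected signed halfword of Rm (48-bit product),
+// keeps bits [47:16], and adds the accumulator Ra. Q is set if the addition overflows.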
+bool TranslatorVisitor::thumb32_SMLAWY(Reg n, Reg a, Reg d, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ IR::U32 m32 = ir.GetRegister(m);
+ if (M) {
+ m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ }
+ const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)));
+ const auto product = ir.LeastSignificantWord(ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16)));
+ const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ ir.OrQFlag(ir.GetOverflowFrom(result));
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SMULWY(Reg n, Reg d, bool M, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n));
+ IR::U32 m32 = ir.GetRegister(m);
+ if (M) {
+ m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result;
+ }
+ const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)));
+ const auto result = ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16));
+
+ ir.SetRegister(d, ir.LeastSignificantWord(result));
+ return true;
+}
+
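+// USAD8/USADA8: sum of absolute differences of the four unsigned byte lanes of
+// Rn and Rm; USADA8 additionally accumulates the sum into Ra.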
+bool TranslatorVisitor::thumb32_USAD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAbsDiffSumU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_USADA8(Reg n, Reg a, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_a = ir.GetRegister(a);
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto tmp = ir.PackedAbsDiffSumU8(reg_n, reg_m);
+ const auto result = ir.AddWithCarry(reg_a, tmp, ir.Imm1(0));
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp
new file mode 100644
index 0000000000..654940967d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp
@@ -0,0 +1,521 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
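+// Helpers for the halfword-exchange forms below: Pack2x16To1x32 places lo in
+// bits [15:0] and hi in bits [31:16]; MostSignificantHalf extracts bits [31:16].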
+static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) {
+ return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
+static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) {
+ return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result);
+}
+
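+// The parallel add/subtract forms write one GE flag bit per byte lane (8-bit forms)
+// or two bits per halfword lane (16-bit forms); SEL consumes these flags.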
+bool TranslatorVisitor::thumb32_SADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddSubS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SSAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubAddS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SSUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SSUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedAddSubU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_USAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubAddU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_USUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_USUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSubU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result.result);
+ ir.SetGEFlags(result.ge);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedAddS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedAddS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
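+// QASX/QSAX exchange the halfwords of Rm, then perform one saturating add and one
+// saturating subtract on the 16-bit lanes, repacking the two results into Rd.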
+bool TranslatorVisitor::thumb32_QASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto diff = ir.SignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result;
+ const auto sum = ir.SignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, diff, sum);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QSAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto sum = ir.SignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result;
+ const auto diff = ir.SignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, sum, diff);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QSUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedSubS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_QSUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedSubS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedAddU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedAddU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result;
+ const auto sum = ir.UnsignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, diff, sum);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQSAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto Rn = ir.GetRegister(n);
+ const auto Rm = ir.GetRegister(m);
+ const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn));
+ const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn));
+ const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm));
+ const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm));
+ const auto sum = ir.UnsignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result;
+ const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result;
+ const auto result = Pack2x16To1x32(ir, sum, diff);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQSUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedSubU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UQSUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedSaturatedSubU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddSubS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHSAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubAddS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHSUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubS8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_SHSUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubS16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHADD8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHADD16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHASX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingAddSubU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHSAX(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubAddU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHSUB8(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubU8(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_UHSUB16(Reg n, Reg d, Reg m) {
+ if (d == Reg::PC || n == Reg::PC || m == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ const auto reg_m = ir.GetRegister(m);
+ const auto reg_n = ir.GetRegister(n);
+ const auto result = ir.PackedHalvingSubU16(reg_n, reg_m);
+
+ ir.SetRegister(d, result);
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp
new file mode 100644
index 0000000000..93c383757e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp
@@ -0,0 +1,223 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
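+// Register-offset stores: the effective address is Rn + (Rm << imm2); the data in
+// Rt is narrowed to the access size by the store_fn callback.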
+template<typename StoreRegFn>
+static bool StoreRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, StoreRegFn store_fn) {
+ if (n == Reg::PC) {
+ return v.UndefinedInstruction();
+ }
+
+ if (t == Reg::PC || m == Reg::PC) {
+ return v.UnpredictableInstruction();
+ }
+
+ const auto reg_m = v.ir.GetRegister(m);
+ const auto reg_n = v.ir.GetRegister(n);
+ const auto reg_t = v.ir.GetRegister(t);
+
+ const auto shift_amount = v.ir.Imm8(static_cast<u8>(imm2.ZeroExtend()));
+ const auto offset = v.ir.LogicalShiftLeft(reg_m, shift_amount);
+ const auto offset_address = v.ir.Add(reg_n, offset);
+
+ store_fn(offset_address, reg_t);
+ return true;
+}
+
+using StoreImmFn = void (*)(TranslatorVisitor&, const IR::U32&, const IR::U32&);
+
+static void StoreImmByteFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) {
+ v.ir.WriteMemory8(address, v.ir.LeastSignificantByte(data), IR::AccType::NORMAL);
+}
+
+static void StoreImmHalfFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) {
+ v.ir.WriteMemory16(address, v.ir.LeastSignificantHalf(data), IR::AccType::NORMAL);
+}
+
+static void StoreImmWordFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) {
+ v.ir.WriteMemory32(address, data, IR::AccType::NORMAL);
+}
+
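+// Immediate-offset stores: P selects whether the access uses the offset address
+// (offset/pre-indexed) or the unmodified base (post-indexed), U selects whether the
+// offset is added or subtracted, and W writes the offset address back to Rn.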
+static bool StoreImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, StoreImmFn store_fn) {
+ const auto imm32 = imm12.ZeroExtend();
+ const auto reg_n = v.ir.GetRegister(n);
+ const auto reg_t = v.ir.GetRegister(t);
+
+ const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm32))
+ : v.ir.Sub(reg_n, v.ir.Imm32(imm32));
+ const IR::U32 address = P ? offset_address
+ : reg_n;
+
+ store_fn(v, address, reg_t);
+ if (W) {
+ v.ir.SetRegister(n, offset_address);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::thumb32_STRB_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC || n == t) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn);
+}
+
+bool TranslatorVisitor::thumb32_STRB_imm_2(Reg n, Reg t, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn);
+}
+
+bool TranslatorVisitor::thumb32_STRB_imm_3(Reg n, Reg t, Imm<12> imm12) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmByteFn);
+}
+
+bool TranslatorVisitor::thumb32_STRBT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: If we ever support privileged execution levels, this should take the
+    //       unpredictable instruction path when executed in hypervisor mode.
+
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat this as a normal STRB, given we don't support
+ // execution levels other than EL0 currently.
+ return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn);
+}
+
+bool TranslatorVisitor::thumb32_STRB(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) {
+ ir.WriteMemory8(offset_address, ir.LeastSignificantByte(data), IR::AccType::NORMAL);
+ });
+}
+
+bool TranslatorVisitor::thumb32_STRH_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC || n == t) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn);
+}
+
+bool TranslatorVisitor::thumb32_STRH_imm_2(Reg n, Reg t, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn);
+}
+
+bool TranslatorVisitor::thumb32_STRH_imm_3(Reg n, Reg t, Imm<12> imm12) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmHalfFn);
+}
+
+bool TranslatorVisitor::thumb32_STRHT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: If we ever support privileged execution levels, this should take the
+    //       unpredictable instruction path when executed in hypervisor mode.
+
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat this as a normal STRH, given we don't support
+ // execution levels other than EL0 currently.
+ return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn);
+}
+
+bool TranslatorVisitor::thumb32_STRH(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) {
+ ir.WriteMemory16(offset_address, ir.LeastSignificantHalf(data), IR::AccType::NORMAL);
+ });
+}
+
+bool TranslatorVisitor::thumb32_STR_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC || n == t) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn);
+}
+
+bool TranslatorVisitor::thumb32_STR_imm_2(Reg n, Reg t, Imm<8> imm8) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn);
+}
+
+bool TranslatorVisitor::thumb32_STR_imm_3(Reg n, Reg t, Imm<12> imm12) {
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+ return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmWordFn);
+}
+
+bool TranslatorVisitor::thumb32_STRT(Reg n, Reg t, Imm<8> imm8) {
+    // TODO: If we ever support privileged execution levels, this should take the
+    //       unpredictable instruction path when executed in hypervisor mode.
+
+ if (n == Reg::PC) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ // Treat this as a normal STR, given we don't support
+ // execution levels other than EL0 currently.
+ return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn);
+}
+
+bool TranslatorVisitor::thumb32_STR_reg(Reg n, Reg t, Imm<2> imm2, Reg m) {
+ return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) {
+ ir.WriteMemory32(offset_address, data, IR::AccType::NORMAL);
+ });
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
new file mode 100644
index 0000000000..a4e37f747a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
@@ -0,0 +1,1489 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+
+namespace Dynarmic::A32 {
+
+template<typename FnT>
+bool TranslatorVisitor::EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg n, ExtReg m, const FnT& fn) {
+ if (!ir.current_location.FPSCR().Stride()) {
+ return UnpredictableInstruction();
+ }
+
+    // VFP register banks hold 8 single-precision or 4 double-precision registers.
+ const size_t register_bank_size = sz ? 4 : 8;
+ size_t vector_length = ir.current_location.FPSCR().Len();
+ const size_t vector_stride = *ir.current_location.FPSCR().Stride();
+
+ // Unpredictable case
+ if (vector_stride * vector_length > register_bank_size) {
+ return UnpredictableInstruction();
+ }
+
+ // Scalar case
+ if (vector_length == 1) {
+ if (vector_stride != 1) {
+ return UnpredictableInstruction();
+ }
+
+ fn(d, n, m);
+ return true;
+ }
+
+ // The VFP register file is divided into banks each containing:
+ // * eight single-precision registers, or
+ // * four double-precision registers.
+ // VFP vector instructions access these registers in a circular manner.
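+    // For example, with single-precision registers and stride 2, a vector starting
+    // at S8 steps S8 -> S10 -> S12 -> S14, wrapping within the S8..S15 bank.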
+ const auto bank_increment = [register_bank_size](ExtReg reg, size_t stride) -> ExtReg {
+ const auto reg_number = static_cast<size_t>(reg);
+ const auto bank_index = reg_number % register_bank_size;
+ const auto bank_start = reg_number - bank_index;
+ const auto next_reg_number = bank_start + ((bank_index + stride) % register_bank_size);
+ return static_cast<ExtReg>(next_reg_number);
+ };
+
+ // The first and fifth banks in the register file are scalar banks.
+ // All the other banks are vector banks.
+ const auto belongs_to_scalar_bank = [](ExtReg reg) -> bool {
+ return (reg >= ExtReg::D0 && reg <= ExtReg::D3)
+ || (reg >= ExtReg::D16 && reg <= ExtReg::D19)
+ || (reg >= ExtReg::S0 && reg <= ExtReg::S7);
+ };
+
+ const bool d_is_scalar = belongs_to_scalar_bank(d);
+ const bool m_is_scalar = belongs_to_scalar_bank(m);
+
+ if (d_is_scalar) {
+        // If the destination register is in a scalar bank, the operands and results are all scalars.
+ vector_length = 1;
+ }
+
+ for (size_t i = 0; i < vector_length; i++) {
+ fn(d, n, m);
+
+ d = bank_increment(d, vector_stride);
+ n = bank_increment(n, vector_stride);
+ if (!m_is_scalar) {
+ m = bank_increment(m, vector_stride);
+ }
+ }
+
+ return true;
+}
+
+template<typename FnT>
+bool TranslatorVisitor::EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg m, const FnT& fn) {
+ return EmitVfpVectorOperation(sz, d, ExtReg::S0, m, [fn](ExtReg d, ExtReg, ExtReg m) {
+ fn(d, m);
+ });
+}
+
+// VADD<c>.F64 <Dd>, <Dn>, <Dm>
+// VADD<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPAdd(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VSUB<c>.F64 <Dd>, <Dn>, <Dm>
+// VSUB<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPSub(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMUL<c>.F64 <Dd>, <Dn>, <Dm>
+// VMUL<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPMul(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMLA<c>.F64 <Dd>, <Dn>, <Dm>
+// VMLA<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPAdd(reg_d, ir.FPMul(reg_n, reg_m));
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMLS<c>.F64 <Dd>, <Dn>, <Dm>
+// VMLS<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPAdd(reg_d, ir.FPNeg(ir.FPMul(reg_n, reg_m)));
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VNMUL<c>.F64 <Dd>, <Dn>, <Dm>
+// VNMUL<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPNeg(ir.FPMul(reg_n, reg_m));
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VNMLA<c>.F64 <Dd>, <Dn>, <Dm>
+// VNMLA<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPAdd(ir.FPNeg(reg_d), ir.FPNeg(ir.FPMul(reg_n, reg_m)));
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VNMLS<c>.F64 <Dd>, <Dn>, <Dm>
+// VNMLS<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPAdd(ir.FPNeg(reg_d), ir.FPMul(reg_n, reg_m));
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VDIV<c>.F64 <Dd>, <Dn>, <Dm>
+// VDIV<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPDiv(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VFNMS<c>.F64 <Dd>, <Dn>, <Dm>
+// VFNMS<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPMulAdd(ir.FPNeg(reg_d), reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VFNMA<c>.F64 <Dd>, <Dn>, <Dm>
+// VFNMA<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPMulSub(ir.FPNeg(reg_d), reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VFMA<c>.F64 <Dd>, <Dn>, <Dm>
+// VFMA<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPMulAdd(reg_d, reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VFMS<c>.F64 <Dd>, <Dn>, <Dm>
+// VFMS<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto result = ir.FPMulSub(reg_d, reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VSEL<c>.F64 <Dd>, <Dn>, <Dm>
+// VSEL<c>.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
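+    // VSEL encodes only EQ, VS, GE and GT in cc; the full four-bit condition is
+    // reconstructed below as cc : (cc<0> XOR cc<1>) : 0.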
+ const Cond cond = concatenate(cc, Imm<1>{cc.Bit<0>() != cc.Bit<1>()}, Imm<1>{0}).ZeroExtend<Cond>();
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this, cond](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.ConditionalSelect(cond, reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMAXNM.F64 <Dd>, <Dn>, <Dm>
+// VMAXNM.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPMaxNumeric(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMINNM.F64 <Dd>, <Dn>, <Dm>
+// VMINNM.F32 <Sd>, <Sn>, <Sm>
+bool TranslatorVisitor::vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) {
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto n = ToExtReg(sz, Vn, N);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) {
+ const auto reg_n = ir.GetExtendedRegister(n);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPMinNumeric(reg_n, reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMOV<c>.32 <Dd[0]>, <Rt>
+bool TranslatorVisitor::vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(true, Vd, D);
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto reg_t = ir.GetRegister(t);
+ const auto result = ir.Pack2x32To1x64(reg_t, ir.MostSignificantWord(reg_d).result);
+
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VMOV<c>.32 <Rt>, <Dn[0]>
+bool TranslatorVisitor::vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n = ToExtReg(true, Vn, N);
+ const auto reg_n = ir.GetExtendedRegister(n);
+ ir.SetRegister(t, ir.LeastSignificantWord(reg_n));
+ return true;
+}
+
+// VMOV<c> <Sn>, <Rt>
+bool TranslatorVisitor::vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n = ToExtReg(false, Vn, N);
+ ir.SetExtendedRegister(n, ir.GetRegister(t));
+ return true;
+}
+
+// VMOV<c> <Rt>, <Sn>
+bool TranslatorVisitor::vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto n = ToExtReg(false, Vn, N);
+ ir.SetRegister(t, ir.GetExtendedRegister(n));
+ return true;
+}
+
+// VMOV<c> <Sm>, <Sm1>, <Rt>, <Rt2>
+bool TranslatorVisitor::vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ const auto m = ToExtReg(false, Vm, M);
+ if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.SetExtendedRegister(m, ir.GetRegister(t));
+ ir.SetExtendedRegister(m + 1, ir.GetRegister(t2));
+ return true;
+}
+
+// VMOV<c> <Rt>, <Rt2>, <Sm>, <Sm1>
+bool TranslatorVisitor::vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ const auto m = ToExtReg(false, Vm, M);
+ if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == t2) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ ir.SetRegister(t, ir.GetExtendedRegister(m));
+ ir.SetRegister(t2, ir.GetExtendedRegister(m + 1));
+ return true;
+}
+
+// VMOV<c> <Dm>, <Rt>, <Rt2>
+bool TranslatorVisitor::vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ const auto m = ToExtReg(true, Vm, M);
+ if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto value = ir.Pack2x32To1x64(ir.GetRegister(t), ir.GetRegister(t2));
+ ir.SetExtendedRegister(m, value);
+ return true;
+}
+
+// VMOV<c> <Rt>, <Rt2>, <Dm>
+bool TranslatorVisitor::vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) {
+ const auto m = ToExtReg(true, Vm, M);
+ if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) {
+ return UnpredictableInstruction();
+ }
+
+ if (t == t2) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto value = ir.GetExtendedRegister(m);
+ ir.SetRegister(t, ir.LeastSignificantWord(value));
+ ir.SetRegister(t2, ir.MostSignificantWord(value).result);
+ return true;
+}
+
+// VMOV<c>.32 <Dn[x]>, <Rt>
+bool TranslatorVisitor::vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = i.ZeroExtend();
+ const auto d = ToVector(false, Vd, D);
+
+ const auto reg_d = ir.GetVector(d);
+ const auto scalar = ir.GetRegister(t);
+ const auto result = ir.VectorSetElement(32, reg_d, index, scalar);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+// VMOV<c>.16 <Dn[x]>, <Rt>
+bool TranslatorVisitor::vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ const auto d = ToVector(false, Vd, D);
+
+ const auto reg_d = ir.GetVector(d);
+ const auto scalar = ir.LeastSignificantHalf(ir.GetRegister(t));
+ const auto result = ir.VectorSetElement(16, reg_d, index, scalar);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+// VMOV<c>.8 <Dn[x]>, <Rt>
+bool TranslatorVisitor::vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ const auto d = ToVector(false, Vd, D);
+
+ const auto reg_d = ir.GetVector(d);
+ const auto scalar = ir.LeastSignificantByte(ir.GetRegister(t));
+ const auto result = ir.VectorSetElement(8, reg_d, index, scalar);
+
+ ir.SetVector(d, result);
+ return true;
+}
+
+// VMOV<c>.32 <Rt>, <Dn[x]>
+bool TranslatorVisitor::vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = i.ZeroExtend();
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto result = ir.VectorGetElement(32, reg_n, index);
+
+ ir.SetRegister(t, result);
+ return true;
+}
+
+// VMOV<c>.{U16,S16} <Rt>, <Dn[x]>
+bool TranslatorVisitor::vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto scalar = ir.VectorGetElement(16, reg_n, index);
+ const auto result = U ? ir.ZeroExtendToWord(scalar) : ir.SignExtendToWord(scalar);
+
+ ir.SetRegister(t, result);
+ return true;
+}
+
+// VMOV<c>.{U8,S8} <Rt>, <Dn[x]>
+bool TranslatorVisitor::vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // TODO: v8 removes UNPREDICTABLE for R13
+ return UnpredictableInstruction();
+ }
+
+ const size_t index = concatenate(i1, i2).ZeroExtend();
+ const auto n = ToVector(false, Vn, N);
+
+ const auto reg_n = ir.GetVector(n);
+ const auto scalar = ir.VectorGetElement(8, reg_n, index);
+ const auto result = U ? ir.ZeroExtendToWord(scalar) : ir.SignExtendToWord(scalar);
+
+ ir.SetRegister(t, result);
+ return true;
+}
+
+// VDUP<c>.{8,16,32} <Qd>, <Rt>
+// VDUP<c>.{8,16,32} <Dd>, <Rt>
+bool TranslatorVisitor::vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (Q && mcl::bit::get_bit<0>(Vd)) {
+ return UndefinedInstruction();
+ }
+ if (t == Reg::R15) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToVector(Q, Vd, D);
+ const size_t BE = concatenate(B, E).ZeroExtend();
+ const size_t esize = 32u >> BE;
+
+ if (BE == 0b11) {
+ return UndefinedInstruction();
+ }
+
+ const auto scalar = ir.LeastSignificant(esize, ir.GetRegister(t));
+ const auto result = ir.VectorBroadcast(esize, scalar);
+ ir.SetVector(d, result);
+ return true;
+}
+
+// VMOV<c>.F64 <Dd>, #<imm>
+// VMOV<c>.F32 <Sd>, #<imm>
+bool TranslatorVisitor::vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (ir.current_location.FPSCR().Stride() != 1 || ir.current_location.FPSCR().Len() != 1) {
+ return UndefinedInstruction();
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto imm8 = concatenate(imm4H, imm4L);
+
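+    // VFPExpandImm: imm8 = abcdefgh expands to sign a, exponent NOT(b):Rep(b):cd and
+    // fraction efgh0...0; e.g. in the F32 case imm8 = 0x70 expands to 0x3F800000 (+1.0f).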
+ if (sz) {
+ const u64 sign = static_cast<u64>(imm8.Bit<7>());
+ const u64 exp = (imm8.Bit<6>() ? 0x3FC : 0x400) | imm8.Bits<4, 5, u64>();
+ const u64 fract = imm8.Bits<0, 3, u64>() << 48;
+ const u64 immediate = (sign << 63) | (exp << 52) | fract;
+ ir.SetExtendedRegister(d, ir.Imm64(immediate));
+ } else {
+ const u32 sign = static_cast<u32>(imm8.Bit<7>());
+ const u32 exp = (imm8.Bit<6>() ? 0x7C : 0x80) | imm8.Bits<4, 5>();
+ const u32 fract = imm8.Bits<0, 3>() << 19;
+ const u32 immediate = (sign << 31) | (exp << 23) | fract;
+ ir.SetExtendedRegister(d, ir.Imm32(immediate));
+ }
+ return true;
+}
+
+// VMOV<c>.F64 <Dd>, <Dm>
+// VMOV<c>.F32 <Sd>, <Sm>
+bool TranslatorVisitor::vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) {
+ ir.SetExtendedRegister(d, ir.GetExtendedRegister(m));
+ });
+}
+
+// VABS<c>.F64 <Dd>, <Dm>
+// VABS<c>.F32 <Sd>, <Sm>
+bool TranslatorVisitor::vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPAbs(reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VNEG<c>.F64 <Dd>, <Dm>
+// VNEG<c>.F32 <Sd>, <Sm>
+bool TranslatorVisitor::vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPNeg(reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VSQRT<c>.F64 <Dd>, <Dm>
+// VSQRT<c>.F32 <Sd>, <Sm>
+bool TranslatorVisitor::vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPSqrt(reg_m);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VCVTB<c>.F32.F16 <Sd>, <Sm>
+// VCVTB<c>.F64.F16 <Dd>, <Sm>
+// VCVTB<c>.F16.F32 <Sd>, <Sm>
+// VCVTB<c>.F16.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
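+    // VCVTB uses the bottom halfword (bits [15:0]) of the single-precision register
+    // that holds the half-precision value; op selects the conversion direction.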
+ const bool convert_from_half = !op;
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+ if (convert_from_half) {
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(false, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.LeastSignificantHalf(ir.GetExtendedRegister(m));
+ const auto result = sz ? IR::U32U64{ir.FPHalfToDouble(reg_m, rounding_mode)} : IR::U32U64{ir.FPHalfToSingle(reg_m, rounding_mode)};
+ ir.SetExtendedRegister(d, result);
+ });
+ } else {
+ const auto d = ToExtReg(false, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = sz ? ir.FPDoubleToHalf(reg_m, rounding_mode) : ir.FPSingleToHalf(reg_m, rounding_mode);
+ ir.SetExtendedRegister(d, ir.Or(ir.And(ir.GetExtendedRegister(d), ir.Imm32(0xFFFF0000)), ir.ZeroExtendToWord(result)));
+ });
+ }
+}
+
+// VCVTT<c>.F32.F16 <Sd>, <Sm>
+// VCVTT<c>.F64.F16 <Dd>, <Sm>
+// VCVTT<c>.F16.F32 <Sd>, <Sm>
+// VCVTT<c>.F16.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
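+    // VCVTT is identical to VCVTB except that the half-precision value lives in the
+    // top halfword (bits [31:16]) of the single-precision register.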
+ const bool convert_from_half = !op;
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+ if (convert_from_half) {
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(false, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.LeastSignificantHalf(ir.LogicalShiftRight(ir.GetExtendedRegister(m), ir.Imm8(16)));
+ const auto result = sz ? IR::U32U64{ir.FPHalfToDouble(reg_m, rounding_mode)} : IR::U32U64{ir.FPHalfToSingle(reg_m, rounding_mode)};
+ ir.SetExtendedRegister(d, result);
+ });
+ } else {
+ const auto d = ToExtReg(false, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = sz ? ir.FPDoubleToHalf(reg_m, rounding_mode) : ir.FPSingleToHalf(reg_m, rounding_mode);
+ ir.SetExtendedRegister(d, ir.Or(ir.And(ir.GetExtendedRegister(d), ir.Imm32(0x0000FFFF)), ir.LogicalShiftLeft(ir.ZeroExtendToWord(result), ir.Imm8(16))));
+ });
+ }
+}
+
+// VCMP{E}.F32 <Sd>, <Sm>
+// VCMP{E}.F64 <Dd>, <Dm>
+bool TranslatorVisitor::vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto exc_on_qnan = E;
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto nzcv = ir.FPCompare(reg_d, reg_m, exc_on_qnan);
+
+ ir.SetFpscrNZCV(nzcv);
+ return true;
+}
+
+// VCMP{E}.F32 <Sd>, #0.0
+// VCMP{E}.F64 <Dd>, #0.0
+bool TranslatorVisitor::vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto exc_on_qnan = E;
+ const auto reg_d = ir.GetExtendedRegister(d);
+
+ if (sz) {
+ const auto nzcv = ir.FPCompare(reg_d, ir.Imm64(0), exc_on_qnan);
+ ir.SetFpscrNZCV(nzcv);
+ } else {
+ const auto nzcv = ir.FPCompare(reg_d, ir.Imm32(0), exc_on_qnan);
+ ir.SetFpscrNZCV(nzcv);
+ }
+
+ return true;
+}
+
+// VRINTR.{F16,F32} <Sd>, <Sm>
+// VRINTR.F64 <Dd>, <Dm>
+bool TranslatorVisitor::vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+
+ const auto result = ir.FPRoundInt(reg_m, rounding_mode, false);
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VRINTZ.{F16,F32} <Sd>, <Sm>
+// VRINTZ.F64 <Dd>, <Dm>
+bool TranslatorVisitor::vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto rounding_mode = FP::RoundingMode::TowardsZero;
+
+ const auto result = ir.FPRoundInt(reg_m, rounding_mode, false);
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VRINTX.{F16,F32} <Sd>, <Sm>
+// VRINTX.F64 <Dd>, <Dm>
+bool TranslatorVisitor::vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+
+ const auto result = ir.FPRoundInt(reg_m, rounding_mode, true);
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VCVT<c>.F64.F32 <Dd>, <Sm>
+// VCVT<c>.F32.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(!sz, Vd, D); // Destination is of opposite size to source
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+
+ if (sz) {
+ const auto result = ir.FPDoubleToSingle(reg_m, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ } else {
+ const auto result = ir.FPSingleToDouble(reg_m, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ }
+
+ return true;
+}
+
+// VCVT.F32.{S32,U32} <Sd>, <Sm>
+// VCVT.F64.{S32,U32} <Dd>, <Sm>
+bool TranslatorVisitor::vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(false, Vm, M);
+ const auto rounding_mode = ir.current_location.FPSCR().RMode();
+ const auto reg_m = ir.GetExtendedRegister(m);
+
+ if (sz) {
+ const auto result = is_signed
+ ? ir.FPSignedFixedToDouble(reg_m, 0, rounding_mode)
+ : ir.FPUnsignedFixedToDouble(reg_m, 0, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ } else {
+ const auto result = is_signed
+ ? ir.FPSignedFixedToSingle(reg_m, 0, rounding_mode)
+ : ir.FPUnsignedFixedToSingle(reg_m, 0, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ }
+
+ return true;
+}
+
+// VCVT.F32.{S16,U16,S32,U32} <Sdm>, <Sdm>
+// VCVT.F64.{S16,U16,S32,U32} <Ddm>, <Ddm>
+bool TranslatorVisitor::vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const size_t size = sx ? 32 : 16;
+ const size_t fbits = size - concatenate(imm4, i).ZeroExtend();
+
+ if (fbits > size) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven;
+ const auto reg_d = ir.GetExtendedRegister(d);
+ const auto source = ir.LeastSignificant(size, reg_d);
+
+ if (sz) {
+ const auto result = U ? ir.FPUnsignedFixedToDouble(source, fbits, rounding_mode)
+ : ir.FPSignedFixedToDouble(source, fbits, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ } else {
+ const auto result = U ? ir.FPUnsignedFixedToSingle(source, fbits, rounding_mode)
+ : ir.FPSignedFixedToSingle(source, fbits, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ }
+
+ return true;
+}
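+
+// Fraction-bit decoding for the fixed-point conversions here and in vfp_VCVT_to_fixed below:
+// the operand is 16 or 32 bits wide (sx) and the 5-bit immediate imm4:i encodes
+// "size minus fbits". Worked example (values chosen for illustration): sx = 1, imm4 = 0b1000,
+// i = 0 gives imm4:i = 16 and therefore fbits = 32 - 16 = 16, i.e. a Q16.16 operand.
+// Because fbits is a size_t, an encoding with imm4:i > size wraps around and is rejected
+// by the fbits > size check as UNPREDICTABLE.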
+
+// VCVT{,R}.U32.F32 <Sd>, <Sm>
+// VCVT{,R}.U32.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const ExtReg d = ToExtReg(false, Vd, D);
+ const ExtReg m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPToFixedU32(reg_m, 0, round_towards_zero ? FP::RoundingMode::TowardsZero : ir.current_location.FPSCR().RMode());
+
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VCVT{,R}.S32.F32 <Sd>, <Sm>
+// VCVT{,R}.S32.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const auto d = ToExtReg(false, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPToFixedS32(reg_m, 0, round_towards_zero ? FP::RoundingMode::TowardsZero : ir.current_location.FPSCR().RMode());
+
+ ir.SetExtendedRegister(d, result);
+ return true;
+}
+
+// VCVT.{S16,U16,S32,U32}.F32 <Sdm>, <Sdm>
+// VCVT.{S16,U16,S32,U32}.F64 <Ddm>, <Ddm>
+bool TranslatorVisitor::vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const size_t size = sx ? 32 : 16;
+ const size_t fbits = size - concatenate(imm4, i).ZeroExtend();
+
+ if (fbits > size) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto rounding_mode = FP::RoundingMode::TowardsZero;
+ const auto reg_d = ir.GetExtendedRegister(d);
+
+ const auto result = [&]() -> IR::U16U32U64 {
+ if (sx) {
+ return U ? ir.FPToFixedU32(reg_d, fbits, rounding_mode)
+ : ir.FPToFixedS32(reg_d, fbits, rounding_mode);
+ } else {
+ return U ? ir.FPToFixedU16(reg_d, fbits, rounding_mode)
+ : ir.FPToFixedS16(reg_d, fbits, rounding_mode);
+ }
+ }();
+
+ if (sz) {
+ ir.SetExtendedRegister(d, U ? ir.ZeroExtendToLong(result) : ir.SignExtendToLong(result));
+ } else {
+ ir.SetExtendedRegister(d, U ? ir.ZeroExtendToWord(result) : ir.SignExtendToWord(result));
+ }
+ return true;
+}
+
+// VRINT{A,N,P,M}.F32 <Sd>, <Sm>
+// VRINT{A,N,P,M}.F64 <Dd>, <Dm>
+bool TranslatorVisitor::vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm) {
+ const std::array rm_lookup{
+ FP::RoundingMode::ToNearest_TieAwayFromZero,
+ FP::RoundingMode::ToNearest_TieEven,
+ FP::RoundingMode::TowardsPlusInfinity,
+ FP::RoundingMode::TowardsMinusInfinity,
+ };
+ const FP::RoundingMode rounding_mode = rm_lookup[rm];
+
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, rounding_mode](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = ir.FPRoundInt(reg_m, rounding_mode, false);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VCVT{A,N,P,M}.{S32,U32}.F32 <Sd>, <Sm>
+// VCVT{A,N,P,M}.{S32,U32}.F64 <Sd>, <Dm>
+bool TranslatorVisitor::vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm) {
+ const std::array rm_lookup{
+ FP::RoundingMode::ToNearest_TieAwayFromZero,
+ FP::RoundingMode::ToNearest_TieEven,
+ FP::RoundingMode::TowardsPlusInfinity,
+ FP::RoundingMode::TowardsMinusInfinity,
+ };
+ const FP::RoundingMode rounding_mode = rm_lookup[rm];
+ const bool unsigned_ = !U;
+
+ const auto d = ToExtReg(false, Vd, D);
+ const auto m = ToExtReg(sz, Vm, M);
+
+ return EmitVfpVectorOperation(sz, d, m, [this, rounding_mode, unsigned_](ExtReg d, ExtReg m) {
+ const auto reg_m = ir.GetExtendedRegister(m);
+ const auto result = unsigned_ ? ir.FPToFixedU32(reg_m, 0, rounding_mode) : ir.FPToFixedS32(reg_m, 0, rounding_mode);
+ ir.SetExtendedRegister(d, result);
+ });
+}
+
+// VMSR FPSCR, <Rt>
+bool TranslatorVisitor::vfp_VMSR(Cond cond, Reg t) {
+ if (t == Reg::PC) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ // TODO: Replace this with a local cache.
+ ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT());
+
+ ir.UpdateUpperLocationDescriptor();
+ ir.SetFpscr(ir.GetRegister(t));
+ ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4));
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ return false;
+}
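+
+// Writing FPSCR can change state that translation depends on (rounding mode, flush-to-zero,
+// and the short-vector Len/Stride fields used by EmitVfpVectorOperation), so the block is
+// ended here and execution resumes in a freshly translated block at PC + 4.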
+
+// VMRS <Rt>, FPSCR
+bool TranslatorVisitor::vfp_VMRS(Cond cond, Reg t) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ if (t == Reg::R15) {
+        // This encodes APSR_nzcv access
+ const auto nzcv = ir.GetFpscrNZCV();
+ ir.SetCpsrNZCVRaw(nzcv);
+ } else {
+ ir.SetRegister(t, ir.GetFpscr());
+ }
+
+ return true;
+}
+
+// VPOP.{F32,F64} <list>
+bool TranslatorVisitor::vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) {
+ const ExtReg d = ToExtReg(sz, Vd, D);
+ const size_t regs = sz ? imm8.ZeroExtend() >> 1 : imm8.ZeroExtend();
+
+ if (regs == 0 || RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (sz && regs > 16) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = ir.GetRegister(Reg::SP);
+ ir.SetRegister(Reg::SP, ir.Add(address, ir.Imm32(imm32)));
+
+ for (size_t i = 0; i < regs; ++i) {
+ if (sz) {
+ auto lo = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ auto hi = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ if (ir.current_location.EFlag()) {
+ std::swap(lo, hi);
+ }
+ ir.SetExtendedRegister(d + i, ir.Pack2x32To1x64(lo, hi));
+ } else {
+ const auto res = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ ir.SetExtendedRegister(d + i, res);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ return true;
+}
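+
+// Register-list arithmetic for VPOP/VPUSH: imm8 counts 32-bit words, so a list of N double
+// registers encodes imm8 = 2*N. Example: popping four doubles gives imm8 = 8, regs = imm8 >> 1 = 4
+// and an SP adjustment of imm32 = 8 << 2 = 32 bytes. With the E flag set (big-endian data) the
+// two words of each double are swapped so the value is reassembled correctly.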
+
+// VPUSH.{F32,F64} <list>
+bool TranslatorVisitor::vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) {
+ const ExtReg d = ToExtReg(sz, Vd, D);
+ const size_t regs = sz ? imm8.ZeroExtend() >> 1 : imm8.ZeroExtend();
+
+ if (regs == 0 || RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (sz && regs > 16) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = ir.Sub(ir.GetRegister(Reg::SP), ir.Imm32(imm32));
+ ir.SetRegister(Reg::SP, address);
+
+ for (size_t i = 0; i < regs; ++i) {
+ if (sz) {
+ const auto reg_d = ir.GetExtendedRegister(d + i);
+ auto lo = ir.LeastSignificantWord(reg_d);
+ auto hi = ir.MostSignificantWord(reg_d).result;
+            if (ir.current_location.EFlag()) {
+                std::swap(lo, hi);
+            }
+ ir.WriteMemory32(address, lo, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ ir.WriteMemory32(address, hi, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ } else {
+ ir.WriteMemory32(address, ir.GetExtendedRegister(d + i), IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+ }
+
+ return true;
+}
+
+// VLDR<c> <Dd>, [<Rn>{, #+/-<imm>}]
+// VLDR<c> <Sd>, [<Rn>{, #+/-<imm>}]
+bool TranslatorVisitor::vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto base = n == Reg::PC ? ir.Imm32(ir.AlignPC(4)) : ir.GetRegister(n);
+ const auto address = U ? ir.Add(base, ir.Imm32(imm32)) : ir.Sub(base, ir.Imm32(imm32));
+
+ if (sz) {
+ auto lo = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ auto hi = ir.ReadMemory32(ir.Add(address, ir.Imm32(4)), IR::AccType::ATOMIC);
+ if (ir.current_location.EFlag()) {
+ std::swap(lo, hi);
+ }
+ ir.SetExtendedRegister(d, ir.Pack2x32To1x64(lo, hi));
+ } else {
+ ir.SetExtendedRegister(d, ir.ReadMemory32(address, IR::AccType::ATOMIC));
+ }
+
+ return true;
+}
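+
+// Address generation for VLDR/VSTR: the byte offset is imm32 = imm8 << 2, added when U is set
+// and subtracted otherwise; when the base register is the PC, the word-aligned PC value is used.
+// Example: a VLDR with Rn == PC, U set and imm8 = 3 reads from Align(PC, 4) + 12.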
+
+// VSTR<c> <Dd>, [<Rn>{, #+/-<imm>}]
+// VSTR<c> <Sd>, [<Rn>{, #+/-<imm>}]
+bool TranslatorVisitor::vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) {
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ const auto d = ToExtReg(sz, Vd, D);
+ const auto base = n == Reg::PC ? ir.Imm32(ir.AlignPC(4)) : ir.GetRegister(n);
+ const auto address = U ? ir.Add(base, ir.Imm32(imm32)) : ir.Sub(base, ir.Imm32(imm32));
+ if (sz) {
+ const auto reg_d = ir.GetExtendedRegister(d);
+ auto lo = ir.LeastSignificantWord(reg_d);
+ auto hi = ir.MostSignificantWord(reg_d).result;
+ if (ir.current_location.EFlag()) {
+ std::swap(lo, hi);
+ }
+ ir.WriteMemory32(address, lo, IR::AccType::ATOMIC);
+ ir.WriteMemory32(ir.Add(address, ir.Imm32(4)), hi, IR::AccType::ATOMIC);
+ } else {
+ ir.WriteMemory32(address, ir.GetExtendedRegister(d), IR::AccType::ATOMIC);
+ }
+
+ return true;
+}
+
+// VSTM{mode}<c> <Rn>{!}, <list of double registers>
+bool TranslatorVisitor::vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ if (!p && !u && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p == u && w) {
+ return arm_UDF();
+ }
+
+ if (n == Reg::PC && w) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(true, Vd, D);
+ const size_t regs = imm8.ZeroExtend() / 2;
+
+ if (regs == 0 || regs > 16 || A32::RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32)));
+ if (w) {
+ ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address);
+ }
+ for (size_t i = 0; i < regs; i++) {
+ const auto value = ir.GetExtendedRegister(d + i);
+ auto word1 = ir.LeastSignificantWord(value);
+ auto word2 = ir.MostSignificantWord(value).result;
+
+ if (ir.current_location.EFlag()) {
+ std::swap(word1, word2);
+ }
+
+ ir.WriteMemory32(address, word1, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ ir.WriteMemory32(address, word2, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+
+ return true;
+}
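+
+// Addressing modes for the VSTM/VLDM family follow from the p/u/w bits: u set selects
+// increment-after (start at Rn, optionally write back Rn + imm32); u clear selects
+// decrement-before (start at Rn - imm32 and optionally write that address back).
+// p == u with writeback is rejected as UNDEFINED by the checks at the top of each handler.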
+
+// VSTM{mode}<c> <Rn>{!}, <list of single registers>
+bool TranslatorVisitor::vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ if (!p && !u && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p == u && w) {
+ return arm_UDF();
+ }
+
+ if (n == Reg::PC && w) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(false, Vd, D);
+ const size_t regs = imm8.ZeroExtend();
+
+ if (regs == 0 || A32::RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32)));
+ if (w) {
+ ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address);
+ }
+ for (size_t i = 0; i < regs; i++) {
+ const auto word = ir.GetExtendedRegister(d + i);
+ ir.WriteMemory32(address, word, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ }
+
+ return true;
+}
+
+// VLDM{mode}<c> <Rn>{!}, <list of double registers>
+bool TranslatorVisitor::vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ if (!p && !u && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p == u && w) {
+ return arm_UDF();
+ }
+
+ if (n == Reg::PC && (w || ir.current_location.TFlag())) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(true, Vd, D);
+ const size_t regs = imm8.ZeroExtend() / 2;
+
+ if (regs == 0 || regs > 16 || A32::RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32)));
+ if (w) {
+ ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address);
+ }
+ for (size_t i = 0; i < regs; i++) {
+ auto word1 = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ auto word2 = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+
+ if (ir.current_location.EFlag()) {
+ std::swap(word1, word2);
+ }
+
+ ir.SetExtendedRegister(d + i, ir.Pack2x32To1x64(word1, word2));
+ }
+
+ return true;
+}
+
+// VLDM{mode}<c> <Rn>{!}, <list of single registers>
+bool TranslatorVisitor::vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) {
+ if (!p && !u && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p && !w) {
+ ASSERT_MSG(false, "Decode error");
+ }
+
+ if (p == u && w) {
+ return arm_UDF();
+ }
+
+ if (n == Reg::PC && (w || ir.current_location.TFlag())) {
+ return UnpredictableInstruction();
+ }
+
+ const auto d = ToExtReg(false, Vd, D);
+ const size_t regs = imm8.ZeroExtend();
+
+ if (regs == 0 || A32::RegNumber(d) + regs > 32) {
+ return UnpredictableInstruction();
+ }
+
+ if (!VFPConditionPassed(cond)) {
+ return true;
+ }
+
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32)));
+ if (w) {
+ ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address);
+ }
+ for (size_t i = 0; i < regs; i++) {
+ const auto word = ir.ReadMemory32(address, IR::AccType::ATOMIC);
+ address = ir.Add(address, ir.Imm32(4));
+ ir.SetExtendedRegister(d + i, word);
+ }
+
+ return true;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp
new file mode 100644
index 0000000000..24b5797de8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp
@@ -0,0 +1,115 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/frontend/A32/decoder/arm.h"
+#include "dynarmic/frontend/A32/decoder/asimd.h"
+#include "dynarmic/frontend/A32/decoder/vfp.h"
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+#include "dynarmic/frontend/A32/translate/conditional_state.h"
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/frontend/A32/translate/translate_callbacks.h"
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/ir/basic_block.h"
+
+namespace Dynarmic::A32 {
+
+IR::Block TranslateArm(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) {
+ const bool single_step = descriptor.SingleStepping();
+
+ IR::Block block{descriptor};
+ TranslatorVisitor visitor{block, descriptor, options};
+
+ bool should_continue = true;
+ do {
+ const u32 arm_pc = visitor.ir.current_location.PC();
+ u64 ticks_for_instruction = 1;
+
+ if (!tcb->PreCodeReadHook(false, arm_pc, visitor.ir)) {
+ should_continue = false;
+ break;
+ }
+
+ if (const auto arm_instruction = tcb->MemoryReadCode(arm_pc)) {
+ visitor.current_instruction_size = 4;
+
+ tcb->PreCodeTranslationHook(false, arm_pc, visitor.ir);
+ ticks_for_instruction = tcb->GetTicksForCode(false, arm_pc, *arm_instruction);
+
+ if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(*arm_instruction)) {
+ should_continue = vfp_decoder->get().call(visitor, *arm_instruction);
+ } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(*arm_instruction)) {
+ should_continue = asimd_decoder->get().call(visitor, *arm_instruction);
+ } else if (const auto decoder = DecodeArm<TranslatorVisitor>(*arm_instruction)) {
+ should_continue = decoder->get().call(visitor, *arm_instruction);
+ } else {
+ should_continue = visitor.arm_UDF();
+ }
+ } else {
+ visitor.current_instruction_size = 4;
+
+ should_continue = visitor.RaiseException(Exception::NoExecuteFault);
+ }
+
+ if (visitor.cond_state == ConditionalState::Break) {
+ break;
+ }
+
+ visitor.ir.current_location = visitor.ir.current_location.AdvancePC(4);
+ block.CycleCount() += ticks_for_instruction;
+ } while (should_continue && CondCanContinue(visitor.cond_state, visitor.ir) && !single_step);
+
+ if (visitor.cond_state == ConditionalState::Translating || visitor.cond_state == ConditionalState::Trailing || single_step) {
+ if (should_continue) {
+ if (single_step) {
+ visitor.ir.SetTerm(IR::Term::LinkBlock{visitor.ir.current_location});
+ } else {
+ visitor.ir.SetTerm(IR::Term::LinkBlockFast{visitor.ir.current_location});
+ }
+ }
+ }
+
+ ASSERT_MSG(block.HasTerminal(), "Terminal has not been set");
+
+ block.SetEndLocation(visitor.ir.current_location);
+
+ return block;
+}
+
+bool TranslateSingleArmInstruction(IR::Block& block, LocationDescriptor descriptor, u32 arm_instruction) {
+ TranslatorVisitor visitor{block, descriptor, {}};
+
+ bool should_continue = true;
+
+ // TODO: Proper cond handling
+
+ visitor.current_instruction_size = 4;
+
+ const u64 ticks_for_instruction = 1;
+
+ if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(arm_instruction)) {
+ should_continue = vfp_decoder->get().call(visitor, arm_instruction);
+ } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(arm_instruction)) {
+ should_continue = asimd_decoder->get().call(visitor, arm_instruction);
+ } else if (const auto decoder = DecodeArm<TranslatorVisitor>(arm_instruction)) {
+ should_continue = decoder->get().call(visitor, arm_instruction);
+ } else {
+ should_continue = visitor.arm_UDF();
+ }
+
+ // TODO: Feedback resulting cond status to caller somehow.
+
+ visitor.ir.current_location = visitor.ir.current_location.AdvancePC(4);
+ block.CycleCount() += ticks_for_instruction;
+
+ block.SetEndLocation(visitor.ir.current_location);
+
+ return should_continue;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h
new file mode 100644
index 0000000000..e0ad48ea5c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h
@@ -0,0 +1,35 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+#pragma once
+
+#include <cstdint>
+#include <optional>
+
+namespace Dynarmic::A32 {
+
+using VAddr = std::uint32_t;
+
+class IREmitter;
+
+struct TranslateCallbacks {
+ // All reads through this callback are 4-byte aligned.
+ // Memory must be interpreted as little endian.
+ virtual std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) = 0;
+
+ // This function is called before the instruction at pc is read.
+ // IR code can be emitted by the callee prior to instruction handling.
+    // By returning false the callee precludes translation of the instruction;
+    // in that case the callee is responsible for setting the terminal.
+ virtual bool PreCodeReadHook(bool is_thumb, VAddr pc, A32::IREmitter& ir) = 0;
+
+    // This function is called before the instruction at pc is translated.
+ // IR code can be emitted by the callee prior to translation of the instruction.
+ virtual void PreCodeTranslationHook(bool is_thumb, VAddr pc, A32::IREmitter& ir) = 0;
+
+ // How many ticks should this instruction take to execute?
+ virtual std::uint64_t GetTicksForCode(bool is_thumb, VAddr vaddr, std::uint32_t instruction) = 0;
+};
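+
+// A minimal sketch of an embedder-side implementation (the type and member names below are
+// hypothetical, and the flat little-endian code buffer plus any bounds checking are assumed
+// to be supplied by the embedder; <cstring> is needed for std::memcpy):
+//
+//     struct FlatBufferCallbacks final : TranslateCallbacks {
+//         explicit FlatBufferCallbacks(const std::uint8_t* code) : code(code) {}
+//
+//         std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) override {
+//             std::uint32_t word;
+//             std::memcpy(&word, code + vaddr, sizeof(word));  // vaddr is 4-byte aligned
+//             return word;
+//         }
+//         bool PreCodeReadHook(bool, VAddr, IREmitter&) override { return true; }   // always translate
+//         void PreCodeTranslationHook(bool, VAddr, IREmitter&) override {}          // no extra IR
+//         std::uint64_t GetTicksForCode(bool, VAddr, std::uint32_t) override { return 1; }
+//
+//         const std::uint8_t* code;
+//     };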
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp
new file mode 100644
index 0000000000..d61d8ccbff
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp
@@ -0,0 +1,227 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <tuple>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bit/swap.hpp>
+
+#include "dynarmic/frontend/A32/a32_ir_emitter.h"
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/frontend/A32/decoder/asimd.h"
+#include "dynarmic/frontend/A32/decoder/thumb16.h"
+#include "dynarmic/frontend/A32/decoder/thumb32.h"
+#include "dynarmic/frontend/A32/decoder/vfp.h"
+#include "dynarmic/frontend/A32/translate/a32_translate.h"
+#include "dynarmic/frontend/A32/translate/conditional_state.h"
+#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h"
+#include "dynarmic/frontend/A32/translate/translate_callbacks.h"
+#include "dynarmic/frontend/imm.h"
+#include "dynarmic/interface/A32/config.h"
+
+namespace Dynarmic::A32 {
+namespace {
+
+enum class ThumbInstSize {
+ Thumb16,
+ Thumb32
+};
+
+bool IsThumb16(u16 first_part) {
+ return first_part < 0xE800;
+}
+
+bool IsUnconditionalInstruction(bool is_thumb_16, u32 instruction) {
+ if (!is_thumb_16)
+ return false;
+ if ((instruction & 0xFF00) == 0b10111110'00000000) // BKPT
+ return true;
+ if ((instruction & 0xFFC0) == 0b10111010'10000000) // HLT
+ return true;
+ return false;
+}
+
+std::optional<std::tuple<u32, ThumbInstSize>> ReadThumbInstruction(u32 arm_pc, TranslateCallbacks* tcb) {
+ u32 instruction;
+
+ const std::optional<u32> first_part = tcb->MemoryReadCode(arm_pc & 0xFFFFFFFC);
+ if (!first_part)
+ return std::nullopt;
+
+ if ((arm_pc & 0x2) != 0) {
+ instruction = *first_part >> 16;
+ } else {
+ instruction = *first_part & 0xFFFF;
+ }
+
+ if (IsThumb16(static_cast<u16>(instruction))) {
+ // 16-bit thumb instruction
+ return std::make_tuple(instruction, ThumbInstSize::Thumb16);
+ }
+
+ // 32-bit thumb instruction
+ // These always start with 0b11101, 0b11110 or 0b11111.
+
+ instruction <<= 16;
+
+ const std::optional<u32> second_part = tcb->MemoryReadCode((arm_pc + 2) & 0xFFFFFFFC);
+ if (!second_part)
+ return std::nullopt;
+
+ if (((arm_pc + 2) & 0x2) != 0) {
+ instruction |= *second_part >> 16;
+ } else {
+ instruction |= *second_part & 0xFFFF;
+ }
+
+ return std::make_tuple(instruction, ThumbInstSize::Thumb32);
+}
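+
+// ReadThumbInstruction always fetches 32-bit-aligned words via MemoryReadCode and then selects
+// the halfword the PC actually points at. Example: for arm_pc = 0x1002 the word at 0x1000 is
+// read and its upper 16 bits (the halfword at 0x1002) are used; if that halfword starts a
+// 32-bit Thumb instruction, the second halfword comes from the low 16 bits of the word at 0x1004.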
+
+// Convert from Thumb ASIMD format to ARM ASIMD format.
+u32 ConvertASIMDInstruction(u32 thumb_instruction) {
+ if ((thumb_instruction & 0xEF000000) == 0xEF000000) {
+ const bool U = mcl::bit::get_bit<28>(thumb_instruction);
+ return 0xF2000000 | (U << 24) | (thumb_instruction & 0x00FFFFFF);
+ }
+
+ if ((thumb_instruction & 0xFF000000) == 0xF9000000) {
+ return 0xF4000000 | (thumb_instruction & 0x00FFFFFF);
+ }
+
+ return 0xF7F0A000; // UDF
+}
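+
+// The Thumb-2 ASIMD encodings are a shuffled form of the ARM ones: 0xEFxxxxxx data-processing
+// encodings map to ARM 0xF2xxxxxx with the U bit moved from bit 28 down to bit 24, and
+// 0xF9xxxxxx element/structure load/store encodings map to ARM 0xF4xxxxxx. Anything else is
+// rewritten to a permanently undefined ARM encoding so the shared ASIMD decoder treats it as UDF.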
+
+bool MaybeVFPOrASIMDInstruction(u32 thumb_instruction) {
+ return (thumb_instruction & 0xEC000000) == 0xEC000000 || (thumb_instruction & 0xFF100000) == 0xF9000000;
+}
+
+} // namespace
+
+IR::Block TranslateThumb(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) {
+ const bool single_step = descriptor.SingleStepping();
+
+ IR::Block block{descriptor};
+ TranslatorVisitor visitor{block, descriptor, options};
+
+ bool should_continue = true;
+ do {
+ const u32 arm_pc = visitor.ir.current_location.PC();
+ u64 ticks_for_instruction = 1;
+
+ if (!tcb->PreCodeReadHook(true, arm_pc, visitor.ir)) {
+ should_continue = false;
+ break;
+ }
+
+ if (const auto maybe_instruction = ReadThumbInstruction(arm_pc, tcb)) {
+ const auto [thumb_instruction, inst_size] = *maybe_instruction;
+ const bool is_thumb_16 = inst_size == ThumbInstSize::Thumb16;
+ visitor.current_instruction_size = is_thumb_16 ? 2 : 4;
+
+ tcb->PreCodeTranslationHook(true, arm_pc, visitor.ir);
+ ticks_for_instruction = tcb->GetTicksForCode(true, arm_pc, thumb_instruction);
+
+ if (IsUnconditionalInstruction(is_thumb_16, thumb_instruction) || visitor.ThumbConditionPassed()) {
+ if (is_thumb_16) {
+ if (const auto decoder = DecodeThumb16<TranslatorVisitor>(static_cast<u16>(thumb_instruction))) {
+ should_continue = decoder->get().call(visitor, static_cast<u16>(thumb_instruction));
+ } else {
+ should_continue = visitor.thumb16_UDF();
+ }
+ } else {
+ if (MaybeVFPOrASIMDInstruction(thumb_instruction)) {
+ if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = vfp_decoder->get().call(visitor, thumb_instruction);
+ } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(ConvertASIMDInstruction(thumb_instruction))) {
+ should_continue = asimd_decoder->get().call(visitor, ConvertASIMDInstruction(thumb_instruction));
+ } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = decoder->get().call(visitor, thumb_instruction);
+ } else {
+ should_continue = visitor.thumb32_UDF();
+ }
+ } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = decoder->get().call(visitor, thumb_instruction);
+ } else {
+ should_continue = visitor.thumb32_UDF();
+ }
+ }
+ }
+ } else {
+ visitor.current_instruction_size = 2;
+
+ should_continue = visitor.RaiseException(Exception::NoExecuteFault);
+ }
+
+ if (visitor.cond_state == ConditionalState::Break) {
+ break;
+ }
+
+ visitor.ir.current_location = visitor.ir.current_location.AdvancePC(static_cast<int>(visitor.current_instruction_size)).AdvanceIT();
+ block.CycleCount() += ticks_for_instruction;
+ } while (should_continue && CondCanContinue(visitor.cond_state, visitor.ir) && !single_step);
+
+ if (visitor.cond_state == ConditionalState::Translating || visitor.cond_state == ConditionalState::Trailing || single_step) {
+ if (should_continue) {
+ if (single_step) {
+ visitor.ir.SetTerm(IR::Term::LinkBlock{visitor.ir.current_location});
+ } else {
+ visitor.ir.SetTerm(IR::Term::LinkBlockFast{visitor.ir.current_location});
+ }
+ }
+ }
+
+ ASSERT_MSG(block.HasTerminal(), "Terminal has not been set");
+
+ block.SetEndLocation(visitor.ir.current_location);
+
+ return block;
+}
+
+bool TranslateSingleThumbInstruction(IR::Block& block, LocationDescriptor descriptor, u32 thumb_instruction) {
+ TranslatorVisitor visitor{block, descriptor, {}};
+
+ bool should_continue = true;
+
+ const bool is_thumb_16 = IsThumb16(static_cast<u16>(thumb_instruction));
+ visitor.current_instruction_size = is_thumb_16 ? 2 : 4;
+
+ const u64 ticks_for_instruction = 1;
+
+ if (is_thumb_16) {
+ if (const auto decoder = DecodeThumb16<TranslatorVisitor>(static_cast<u16>(thumb_instruction))) {
+ should_continue = decoder->get().call(visitor, static_cast<u16>(thumb_instruction));
+ } else {
+ should_continue = visitor.thumb16_UDF();
+ }
+ } else {
+ thumb_instruction = mcl::bit::swap_halves_32(thumb_instruction);
+ if (MaybeVFPOrASIMDInstruction(thumb_instruction)) {
+ if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = vfp_decoder->get().call(visitor, thumb_instruction);
+ } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(ConvertASIMDInstruction(thumb_instruction))) {
+ should_continue = asimd_decoder->get().call(visitor, ConvertASIMDInstruction(thumb_instruction));
+ } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = decoder->get().call(visitor, thumb_instruction);
+ } else {
+ should_continue = visitor.thumb32_UDF();
+ }
+ } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) {
+ should_continue = decoder->get().call(visitor, thumb_instruction);
+ } else {
+ should_continue = visitor.thumb32_UDF();
+ }
+ }
+
+ const s32 advance_pc = is_thumb_16 ? 2 : 4;
+ visitor.ir.current_location = visitor.ir.current_location.AdvancePC(advance_pc);
+ block.CycleCount() += ticks_for_instruction;
+
+ block.SetEndLocation(visitor.ir.current_location);
+
+ return should_continue;
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp
new file mode 100644
index 0000000000..3f5a70bdc0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp
@@ -0,0 +1,265 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/a64_ir_emitter.h"
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::A64 {
+
+using Opcode = IR::Opcode;
+
+u64 IREmitter::PC() const {
+ return current_location->PC();
+}
+
+u64 IREmitter::AlignPC(size_t alignment) const {
+ const u64 pc = PC();
+ return static_cast<u64>(pc - pc % alignment);
+}
+
+void IREmitter::SetCheckBit(const IR::U1& value) {
+ Inst(Opcode::A64SetCheckBit, value);
+}
+
+IR::U1 IREmitter::GetCFlag() {
+ return Inst<IR::U1>(Opcode::A64GetCFlag);
+}
+
+IR::U32 IREmitter::GetNZCVRaw() {
+ return Inst<IR::U32>(Opcode::A64GetNZCVRaw);
+}
+
+void IREmitter::SetNZCVRaw(IR::U32 value) {
+ Inst(Opcode::A64SetNZCVRaw, value);
+}
+
+void IREmitter::SetNZCV(const IR::NZCV& nzcv) {
+ Inst(Opcode::A64SetNZCV, nzcv);
+}
+
+void IREmitter::CallSupervisor(u32 imm) {
+ Inst(Opcode::A64CallSupervisor, Imm32(imm));
+}
+
+void IREmitter::ExceptionRaised(Exception exception) {
+ Inst(Opcode::A64ExceptionRaised, Imm64(PC()), Imm64(static_cast<u64>(exception)));
+}
+
+void IREmitter::DataCacheOperationRaised(DataCacheOperation op, const IR::U64& value) {
+ Inst(Opcode::A64DataCacheOperationRaised, ImmCurrentLocationDescriptor(), Imm64(static_cast<u64>(op)), value);
+}
+
+void IREmitter::InstructionCacheOperationRaised(InstructionCacheOperation op, const IR::U64& value) {
+ Inst(Opcode::A64InstructionCacheOperationRaised, Imm64(static_cast<u64>(op)), value);
+}
+
+void IREmitter::DataSynchronizationBarrier() {
+ Inst(Opcode::A64DataSynchronizationBarrier);
+}
+
+void IREmitter::DataMemoryBarrier() {
+ Inst(Opcode::A64DataMemoryBarrier);
+}
+
+void IREmitter::InstructionSynchronizationBarrier() {
+ Inst(Opcode::A64InstructionSynchronizationBarrier);
+}
+
+IR::U32 IREmitter::GetCNTFRQ() {
+ return Inst<IR::U32>(Opcode::A64GetCNTFRQ);
+}
+
+IR::U64 IREmitter::GetCNTPCT() {
+ return Inst<IR::U64>(Opcode::A64GetCNTPCT);
+}
+
+IR::U32 IREmitter::GetCTR() {
+ return Inst<IR::U32>(Opcode::A64GetCTR);
+}
+
+IR::U32 IREmitter::GetDCZID() {
+ return Inst<IR::U32>(Opcode::A64GetDCZID);
+}
+
+IR::U64 IREmitter::GetTPIDR() {
+ return Inst<IR::U64>(Opcode::A64GetTPIDR);
+}
+
+void IREmitter::SetTPIDR(const IR::U64& value) {
+ Inst(Opcode::A64SetTPIDR, value);
+}
+
+IR::U64 IREmitter::GetTPIDRRO() {
+ return Inst<IR::U64>(Opcode::A64GetTPIDRRO);
+}
+
+void IREmitter::ClearExclusive() {
+ Inst(Opcode::A64ClearExclusive);
+}
+
+IR::U8 IREmitter::ReadMemory8(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U8>(Opcode::A64ReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U16 IREmitter::ReadMemory16(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U16>(Opcode::A64ReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ReadMemory32(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U64 IREmitter::ReadMemory64(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U64>(Opcode::A64ReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U128 IREmitter::ReadMemory128(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U128>(Opcode::A64ReadMemory128, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U8 IREmitter::ExclusiveReadMemory8(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U8>(Opcode::A64ExclusiveReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U16 IREmitter::ExclusiveReadMemory16(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U16>(Opcode::A64ExclusiveReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveReadMemory32(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U64 IREmitter::ExclusiveReadMemory64(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U64>(Opcode::A64ExclusiveReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+IR::U128 IREmitter::ExclusiveReadMemory128(const IR::U64& vaddr, IR::AccType acc_type) {
+ return Inst<IR::U128>(Opcode::A64ExclusiveReadMemory128, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type) {
+ Inst(Opcode::A64WriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type) {
+ Inst(Opcode::A64WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type) {
+ Inst(Opcode::A64WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type) {
+ Inst(Opcode::A64WriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+void IREmitter::WriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type) {
+ Inst(Opcode::A64WriteMemory128, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::ExclusiveWriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type) {
+ return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory128, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type});
+}
+
+IR::U32 IREmitter::GetW(Reg reg) {
+ if (reg == Reg::ZR)
+ return Imm32(0);
+ return Inst<IR::U32>(Opcode::A64GetW, IR::Value(reg));
+}
+
+IR::U64 IREmitter::GetX(Reg reg) {
+ if (reg == Reg::ZR)
+ return Imm64(0);
+ return Inst<IR::U64>(Opcode::A64GetX, IR::Value(reg));
+}
+
+IR::U128 IREmitter::GetS(Vec vec) {
+ return Inst<IR::U128>(Opcode::A64GetS, IR::Value(vec));
+}
+
+IR::U128 IREmitter::GetD(Vec vec) {
+ return Inst<IR::U128>(Opcode::A64GetD, IR::Value(vec));
+}
+
+IR::U128 IREmitter::GetQ(Vec vec) {
+ return Inst<IR::U128>(Opcode::A64GetQ, IR::Value(vec));
+}
+
+IR::U64 IREmitter::GetSP() {
+ return Inst<IR::U64>(Opcode::A64GetSP);
+}
+
+IR::U32 IREmitter::GetFPCR() {
+ return Inst<IR::U32>(Opcode::A64GetFPCR);
+}
+
+IR::U32 IREmitter::GetFPSR() {
+ return Inst<IR::U32>(Opcode::A64GetFPSR);
+}
+
+void IREmitter::SetW(const Reg reg, const IR::U32& value) {
+ if (reg == Reg::ZR)
+ return;
+ Inst(Opcode::A64SetW, IR::Value(reg), value);
+}
+
+void IREmitter::SetX(const Reg reg, const IR::U64& value) {
+ if (reg == Reg::ZR)
+ return;
+ Inst(Opcode::A64SetX, IR::Value(reg), value);
+}
+
+void IREmitter::SetS(const Vec vec, const IR::U128& value) {
+ Inst(Opcode::A64SetS, IR::Value(vec), value);
+}
+
+void IREmitter::SetD(const Vec vec, const IR::U128& value) {
+ Inst(Opcode::A64SetD, IR::Value(vec), value);
+}
+
+void IREmitter::SetQ(const Vec vec, const IR::U128& value) {
+ Inst(Opcode::A64SetQ, IR::Value(vec), value);
+}
+
+void IREmitter::SetSP(const IR::U64& value) {
+ Inst(Opcode::A64SetSP, value);
+}
+
+void IREmitter::SetFPCR(const IR::U32& value) {
+ Inst(Opcode::A64SetFPCR, value);
+}
+
+void IREmitter::SetFPSR(const IR::U32& value) {
+ Inst(Opcode::A64SetFPSR, value);
+}
+
+void IREmitter::SetPC(const IR::U64& value) {
+ Inst(Opcode::A64SetPC, value);
+}
+
+IR::U64 IREmitter::ImmCurrentLocationDescriptor() {
+ return Imm64(IR::LocationDescriptor{*current_location}.Value());
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h
new file mode 100644
index 0000000000..7fc8bea7c4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h
@@ -0,0 +1,102 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/a64_types.h"
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/ir/ir_emitter.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::A64 {
+
+/**
+ * Convenience class to construct a basic block of the intermediate representation.
+ * `block` is the resulting block.
+ * The user of this class updates `current_location` as appropriate.
+ */
+class IREmitter : public IR::IREmitter {
+public:
+ explicit IREmitter(IR::Block& block)
+ : IR::IREmitter(block) {}
+ explicit IREmitter(IR::Block& block, LocationDescriptor descriptor)
+ : IR::IREmitter(block), current_location(descriptor) {}
+
+ std::optional<LocationDescriptor> current_location;
+
+ u64 PC() const;
+ u64 AlignPC(size_t alignment) const;
+
+ void SetCheckBit(const IR::U1& value);
+ IR::U1 GetCFlag();
+ IR::U32 GetNZCVRaw();
+ void SetNZCVRaw(IR::U32 value);
+ void SetNZCV(const IR::NZCV& nzcv);
+
+ void CallSupervisor(u32 imm);
+ void ExceptionRaised(Exception exception);
+ void DataCacheOperationRaised(DataCacheOperation op, const IR::U64& value);
+ void InstructionCacheOperationRaised(InstructionCacheOperation op, const IR::U64& value);
+ void DataSynchronizationBarrier();
+ void DataMemoryBarrier();
+ void InstructionSynchronizationBarrier();
+ IR::U32 GetCNTFRQ();
+ IR::U64 GetCNTPCT(); // TODO: Ensure sub-basic-block cycle counts are updated before this.
+ IR::U32 GetCTR();
+ IR::U32 GetDCZID();
+ IR::U64 GetTPIDR();
+ IR::U64 GetTPIDRRO();
+ void SetTPIDR(const IR::U64& value);
+
+ void ClearExclusive();
+ IR::U8 ReadMemory8(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U16 ReadMemory16(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U32 ReadMemory32(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U64 ReadMemory64(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U128 ReadMemory128(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U8 ExclusiveReadMemory8(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U16 ExclusiveReadMemory16(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U32 ExclusiveReadMemory32(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U64 ExclusiveReadMemory64(const IR::U64& vaddr, IR::AccType acc_type);
+ IR::U128 ExclusiveReadMemory128(const IR::U64& vaddr, IR::AccType acc_type);
+ void WriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type);
+ void WriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type);
+ void WriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type);
+ void WriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type);
+ void WriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type);
+ IR::U32 ExclusiveWriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type);
+
+ IR::U32 GetW(Reg source_reg);
+ IR::U64 GetX(Reg source_reg);
+ IR::U128 GetS(Vec source_vec);
+ IR::U128 GetD(Vec source_vec);
+ IR::U128 GetQ(Vec source_vec);
+ IR::U64 GetSP();
+ IR::U32 GetFPCR();
+ IR::U32 GetFPSR();
+ void SetW(Reg dest_reg, const IR::U32& value);
+ void SetX(Reg dest_reg, const IR::U64& value);
+ void SetS(Vec dest_vec, const IR::U128& value);
+ void SetD(Vec dest_vec, const IR::U128& value);
+ void SetQ(Vec dest_vec, const IR::U128& value);
+ void SetSP(const IR::U64& value);
+ void SetFPCR(const IR::U32& value);
+ void SetFPSR(const IR::U32& value);
+ void SetPC(const IR::U64& value);
+
+private:
+ IR::U64 ImmCurrentLocationDescriptor();
+};
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp
new file mode 100644
index 0000000000..83a931ff40
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+
+#include <fmt/format.h>
+
+namespace Dynarmic::A64 {
+
+std::string ToString(const LocationDescriptor& descriptor) {
+ return fmt::format("{{{}, {}{}}}", descriptor.PC(), descriptor.FPCR().Value(), descriptor.SingleStepping() ? ", step" : "");
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h
new file mode 100644
index 0000000000..122bebbcb7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h
@@ -0,0 +1,115 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <tuple>
+
+#include <fmt/format.h>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/fp/fpcr.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::A64 {
+
+/**
+ * LocationDescriptor describes the location of a basic block.
+ * The location is not solely based on the PC because other flags influence the way
+ * instructions should be translated.
+ */
+class LocationDescriptor {
+public:
+ static constexpr size_t pc_bit_count = 56;
+ static constexpr u64 pc_mask = mcl::bit::ones<u64>(pc_bit_count);
+ static constexpr u32 fpcr_mask = 0x07C8'0000;
+ static constexpr size_t fpcr_shift = 37;
+ static constexpr size_t single_stepping_bit = 57;
+ static_assert((pc_mask & (u64(fpcr_mask) << fpcr_shift) & (u64(1) << single_stepping_bit)) == 0);
+
+ LocationDescriptor(u64 pc, FP::FPCR fpcr, bool single_stepping = false)
+ : pc(pc & pc_mask), fpcr(fpcr.Value() & fpcr_mask), single_stepping(single_stepping) {}
+
+ explicit LocationDescriptor(const IR::LocationDescriptor& o)
+ : pc(o.Value() & pc_mask)
+ , fpcr((o.Value() >> fpcr_shift) & fpcr_mask)
+ , single_stepping(mcl::bit::get_bit<single_stepping_bit>(o.Value())) {}
+
+ u64 PC() const { return mcl::bit::sign_extend<pc_bit_count>(pc); }
+ FP::FPCR FPCR() const { return fpcr; }
+ bool SingleStepping() const { return single_stepping; }
+
+ bool operator==(const LocationDescriptor& o) const {
+ return std::tie(pc, fpcr, single_stepping) == std::tie(o.pc, o.fpcr, o.single_stepping);
+ }
+
+ bool operator!=(const LocationDescriptor& o) const {
+ return !operator==(o);
+ }
+
+ LocationDescriptor SetPC(u64 new_pc) const {
+ return LocationDescriptor(new_pc, fpcr, single_stepping);
+ }
+
+ LocationDescriptor AdvancePC(int amount) const {
+ return LocationDescriptor(static_cast<u64>(pc + amount), fpcr, single_stepping);
+ }
+
+ LocationDescriptor SetSingleStepping(bool new_single_stepping) const {
+ return LocationDescriptor(pc, fpcr, new_single_stepping);
+ }
+
+ u64 UniqueHash() const noexcept {
+ // This value MUST BE UNIQUE.
+ // This calculation has to match up with EmitTerminalPopRSBHint
+ const u64 fpcr_u64 = static_cast<u64>(fpcr.Value()) << fpcr_shift;
+ const u64 single_stepping_u64 = static_cast<u64>(single_stepping) << single_stepping_bit;
+ return pc | fpcr_u64 | single_stepping_u64;
+ }
+
+ operator IR::LocationDescriptor() const {
+ return IR::LocationDescriptor{UniqueHash()};
+ }
+
+private:
+ u64 pc; ///< Current program counter value.
+ FP::FPCR fpcr; ///< Floating point control register.
+ bool single_stepping;
+};
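+
+// Layout of UniqueHash(): bits [55:0] hold the truncated PC, bit 57 the single-stepping flag,
+// and the masked FPCR bits (bits 19 and 22-26 of FPCR: FZ16, RMode, FZ, DN, AHP) land at
+// bits 56 and 59-63 after the shift by fpcr_shift. The static_assert above is intended to
+// guard against these fields overlapping, and EmitTerminalPopRSBHint must pack locations
+// in exactly the same way.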
+
+/**
+ * Provides a string representation of a LocationDescriptor.
+ *
+ * @param descriptor The descriptor to get a string representation of
+ */
+std::string ToString(const LocationDescriptor& descriptor);
+
+} // namespace Dynarmic::A64
+
+namespace std {
+template<>
+struct less<Dynarmic::A64::LocationDescriptor> {
+ bool operator()(const Dynarmic::A64::LocationDescriptor& x, const Dynarmic::A64::LocationDescriptor& y) const noexcept {
+ return x.UniqueHash() < y.UniqueHash();
+ }
+};
+template<>
+struct hash<Dynarmic::A64::LocationDescriptor> {
+ size_t operator()(const Dynarmic::A64::LocationDescriptor& x) const noexcept {
+ return std::hash<u64>()(x.UniqueHash());
+ }
+};
+} // namespace std
+
+template<>
+struct fmt::formatter<Dynarmic::A64::LocationDescriptor> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A64::LocationDescriptor descriptor, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::A64::ToString(descriptor), ctx);
+ }
+};
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp
new file mode 100644
index 0000000000..ca1c59ab36
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp
@@ -0,0 +1,33 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/a64_types.h"
+
+#include <array>
+#include <ostream>
+
+#include <fmt/format.h>
+
+namespace Dynarmic::A64 {
+
+const char* CondToString(Cond cond) {
+ static constexpr std::array cond_strs = {
+ "eq", "ne", "hs", "lo", "mi", "pl", "vs", "vc",
+ "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"};
+ return cond_strs.at(static_cast<size_t>(cond));
+}
+
+std::string RegToString(Reg reg) {
+ if (reg == Reg::R31) {
+ return "sp|zr";
+ }
+ return fmt::format("r{}", static_cast<size_t>(reg));
+}
+
+std::string VecToString(Vec vec) {
+ return fmt::format("v{}", static_cast<size_t>(vec));
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h
new file mode 100644
index 0000000000..6bc7a24536
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h
@@ -0,0 +1,142 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/cond.h"
+
+namespace Dynarmic::A64 {
+
+using Cond = IR::Cond;
+
+enum class Reg {
+ R0,
+ R1,
+ R2,
+ R3,
+ R4,
+ R5,
+ R6,
+ R7,
+ R8,
+ R9,
+ R10,
+ R11,
+ R12,
+ R13,
+ R14,
+ R15,
+ R16,
+ R17,
+ R18,
+ R19,
+ R20,
+ R21,
+ R22,
+ R23,
+ R24,
+ R25,
+ R26,
+ R27,
+ R28,
+ R29,
+ R30,
+ R31,
+ LR = R30,
+ SP = R31,
+ ZR = R31,
+};
+
+enum class Vec {
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7,
+ V8,
+ V9,
+ V10,
+ V11,
+ V12,
+ V13,
+ V14,
+ V15,
+ V16,
+ V17,
+ V18,
+ V19,
+ V20,
+ V21,
+ V22,
+ V23,
+ V24,
+ V25,
+ V26,
+ V27,
+ V28,
+ V29,
+ V30,
+ V31,
+};
+
+enum class ShiftType {
+ LSL,
+ LSR,
+ ASR,
+ ROR,
+};
+
+const char* CondToString(Cond cond);
+std::string RegToString(Reg reg);
+std::string VecToString(Vec vec);
+
+constexpr size_t RegNumber(Reg reg) {
+ return static_cast<size_t>(reg);
+}
+
+constexpr size_t VecNumber(Vec vec) {
+ return static_cast<size_t>(vec);
+}
+
+inline Reg operator+(Reg reg, size_t number) {
+ const size_t new_reg = RegNumber(reg) + number;
+ ASSERT(new_reg <= 31);
+
+ return static_cast<Reg>(new_reg);
+}
+
+inline Vec operator+(Vec vec, size_t number) {
+ const size_t new_vec = VecNumber(vec) + number;
+ ASSERT(new_vec <= 31);
+
+ return static_cast<Vec>(new_vec);
+}
+
+} // namespace Dynarmic::A64
+
+template<>
+struct fmt::formatter<Dynarmic::A64::Reg> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A64::Reg reg, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::A64::RegToString(reg), ctx);
+ }
+};
+
+template<>
+struct fmt::formatter<Dynarmic::A64::Vec> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::A64::Vec vec, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::A64::VecToString(vec), ctx);
+ }
+};
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h
new file mode 100644
index 0000000000..d6f447e575
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h
@@ -0,0 +1,83 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/decoder/decoder_detail.h"
+#include "dynarmic/frontend/decoder/matcher.h"
+
+namespace Dynarmic::A64 {
+
+template<typename Visitor>
+using Matcher = Decoder::Matcher<Visitor, u32>;
+
+template<typename Visitor>
+using DecodeTable = std::array<std::vector<Matcher<Visitor>>, 0x1000>;
+
+namespace detail {
+inline size_t ToFastLookupIndex(u32 instruction) {
+ return ((instruction >> 10) & 0x00F) | ((instruction >> 18) & 0xFF0);
+}
+} // namespace detail
+
+template<typename V>
+DecodeTable<V> GetDecodeTable() {
+ std::vector<Matcher<V>> list = {
+#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Matcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)),
+#include "./a64.inc"
+#undef INST
+ };
+
+ std::stable_sort(list.begin(), list.end(), [](const auto& matcher1, const auto& matcher2) {
+ // If a matcher has more bits in its mask it is more specific, so it should come first.
+ return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask());
+ });
+
+ // Exceptions to the above rule of thumb.
+ const std::set<std::string> comes_first{
+ "MOVI, MVNI, ORR, BIC (vector, immediate)",
+ "FMOV (vector, immediate)",
+ "Unallocated SIMD modified immediate",
+ };
+
+ std::stable_partition(list.begin(), list.end(), [&](const auto& matcher) {
+ return comes_first.count(matcher.GetName()) > 0;
+ });
+
+ DecodeTable<V> table{};
+ for (size_t i = 0; i < table.size(); ++i) {
+        for (const auto& matcher : list) {
+ const auto expect = detail::ToFastLookupIndex(matcher.GetExpected());
+ const auto mask = detail::ToFastLookupIndex(matcher.GetMask());
+ if ((i & mask) == expect) {
+ table[i].push_back(matcher);
+ }
+ }
+ }
+ return table;
+}
+
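+// Returns the first (i.e. most specific) matcher in the instruction's bucket
+// that matches the encoding, or std::nullopt for an unallocated encoding.
+// Usage sketch (names illustrative): with a visitor type V that provides one
+// handler per INST entry, dispatch looks roughly like
+//
+//     if (const auto matcher = Decode<V>(instruction)) {
+//         matcher->get().call(visitor, instruction);
+//     }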
+template<typename V>
+std::optional<std::reference_wrapper<const Matcher<V>>> Decode(u32 instruction) {
+ static const auto table = GetDecodeTable<V>();
+
+ const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
+
+ const auto& subtable = table[detail::ToFastLookupIndex(instruction)];
+ auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction);
+ return iter != subtable.end() ? std::optional<std::reference_wrapper<const Matcher<V>>>(*iter) : std::nullopt;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc
new file mode 100644
index 0000000000..23f8b71933
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc
@@ -0,0 +1,1029 @@
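+// Bitstring notation: '0' and '1' are fixed opcode bits, '-' marks a bit that is
+// ignored when matching, and letters mark operand fields that are extracted and
+// passed to the handler.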
+// Data processing - Immediate - PC relative addressing
+INST(ADR, "ADR", "0ii10000iiiiiiiiiiiiiiiiiiiddddd")
+INST(ADRP, "ADRP", "1ii10000iiiiiiiiiiiiiiiiiiiddddd")
+
+// Data processing - Immediate - Add/Sub (with tags)
+//INST(ADDG, "ADDG", "1001000110iiiiii00IIIInnnnnddddd") // ARMv8.5
+//INST(SUBG, "SUBG", "1101000110iiiiii00IIIInnnnnddddd") // ARMv8.5
+
+// Data processing - Immediate - Add/Sub
+INST(ADD_imm, "ADD (immediate)", "z0010001ssiiiiiiiiiiiinnnnnddddd")
+INST(ADDS_imm, "ADDS (immediate)", "z0110001ssiiiiiiiiiiiinnnnnddddd")
+INST(SUB_imm, "SUB (immediate)", "z1010001ssiiiiiiiiiiiinnnnnddddd")
+INST(SUBS_imm, "SUBS (immediate)", "z1110001ssiiiiiiiiiiiinnnnnddddd")
+
+// Data processing - Immediate - Logical
+INST(AND_imm, "AND (immediate)", "z00100100Nrrrrrrssssssnnnnnddddd")
+INST(ORR_imm, "ORR (immediate)", "z01100100Nrrrrrrssssssnnnnnddddd")
+INST(EOR_imm, "EOR (immediate)", "z10100100Nrrrrrrssssssnnnnnddddd")
+INST(ANDS_imm, "ANDS (immediate)", "z11100100Nrrrrrrssssssnnnnnddddd")
+
+// Data processing - Immediate - Move Wide
+INST(MOVN, "MOVN", "z00100101ssiiiiiiiiiiiiiiiiddddd")
+INST(MOVZ, "MOVZ", "z10100101ssiiiiiiiiiiiiiiiiddddd")
+INST(MOVK, "MOVK", "z11100101ssiiiiiiiiiiiiiiiiddddd")
+
+// Data processing - Immediate - Bitfield
+INST(SBFM, "SBFM", "z00100110Nrrrrrrssssssnnnnnddddd")
+INST(BFM, "BFM", "z01100110Nrrrrrrssssssnnnnnddddd")
+INST(UBFM, "UBFM", "z10100110Nrrrrrrssssssnnnnnddddd")
+INST(ASR_1, "ASR (immediate, 32-bit)", "00010011000rrrrr011111nnnnnddddd")
+INST(ASR_2, "ASR (immediate, 64-bit)", "1001001101rrrrrr111111nnnnnddddd")
+INST(SXTB_1, "SXTB (32-bit)", "0001001100000000000111nnnnnddddd")
+INST(SXTB_2, "SXTB (64-bit)", "1001001101000000000111nnnnnddddd")
+INST(SXTH_1, "SXTH (32-bit)", "0001001100000000001111nnnnnddddd")
+INST(SXTH_2, "SXTH (64-bit)", "1001001101000000001111nnnnnddddd")
+INST(SXTW, "SXTW", "1001001101000000011111nnnnnddddd")
+
+// Data processing - Immediate - Extract
+INST(EXTR, "EXTR", "z00100111N0mmmmmssssssnnnnnddddd")
+
+// Conditional branch
+INST(B_cond, "B.cond", "01010100iiiiiiiiiiiiiiiiiii0cccc")
+
+// Exception generation
+INST(SVC, "SVC", "11010100000iiiiiiiiiiiiiiii00001")
+//INST(HVC, "HVC", "11010100000iiiiiiiiiiiiiiii00010")
+//INST(SMC, "SMC", "11010100000iiiiiiiiiiiiiiii00011")
+INST(BRK, "BRK", "11010100001iiiiiiiiiiiiiiii00000")
+//INST(HLT, "HLT", "11010100010iiiiiiiiiiiiiiii00000")
+//INST(DCPS1, "DCPS1", "11010100101iiiiiiiiiiiiiiii00001")
+//INST(DCPS2, "DCPS2", "11010100101iiiiiiiiiiiiiiii00010")
+//INST(DCPS3, "DCPS3", "11010100101iiiiiiiiiiiiiiii00011")
+
+// System
+//INST(MSR_imm, "MSR (immediate)", "1101010100000ooo0100MMMMooo11111")
+INST(HINT, "HINT", "11010101000000110010MMMMooo11111")
+INST(NOP, "NOP", "11010101000000110010000000011111")
+INST(YIELD, "YIELD", "11010101000000110010000000111111")
+INST(WFE, "WFE", "11010101000000110010000001011111")
+INST(WFI, "WFI", "11010101000000110010000001111111")
+INST(SEV, "SEV", "11010101000000110010000010011111")
+INST(SEVL, "SEVL", "11010101000000110010000010111111")
+//INST(DGH, "DGH", "11010101000000110010000011011111") // v8.6
+//INST(WFET, "WFET", "110101010000001100010000000ddddd") // v8.7
+//INST(WFIT, "WFIT", "110101010000001100010000001ddddd") // v8.7
+//INST(XPAC_1, "XPACD, XPACI, XPACLRI", "110110101100000101000D11111ddddd")
+//INST(XPAC_2, "XPACD, XPACI, XPACLRI", "11010101000000110010000011111111")
+//INST(PACIA_1, "PACIA, PACIA1716, PACIASP, PACIAZ, PACIZA", "110110101100000100Z000nnnnnddddd")
+//INST(PACIA_2, "PACIA, PACIA1716, PACIASP, PACIAZ, PACIZA", "1101010100000011001000-100-11111")
+//INST(PACIB_1, "PACIB, PACIB1716, PACIBSP, PACIBZ, PACIZB", "110110101100000100Z001nnnnnddddd")
+//INST(PACIB_2, "PACIB, PACIB1716, PACIBSP, PACIBZ, PACIZB", "1101010100000011001000-101-11111")
+//INST(AUTIA_1, "AUTIA, AUTIA1716, AUTIASP, AUTIAZ, AUTIZA", "110110101100000100Z100nnnnnddddd")
+//INST(AUTIA_2, "AUTIA, AUTIA1716, AUTIASP, AUTIAZ, AUTIZA", "1101010100000011001000-110-11111")
+//INST(AUTIB_1, "AUTIB, AUTIB1716, AUTIBSP, AUTIBZ, AUTIZB", "110110101100000100Z101nnnnnddddd")
+//INST(AUTIB_2, "AUTIB, AUTIB1716, AUTIBSP, AUTIBZ, AUTIZB", "1101010100000011001000-111-11111")
+//INST(BTI, "BTI", "110101010000001100100100ii011111") // ARMv8.5
+//INST(ESB, "ESB", "11010101000000110010001000011111")
+//INST(PSB, "PSB CSYNC", "11010101000000110010001000111111")
+//INST(TSB, "TSB CSYNC", "11010101000000110010001001011111") // ARMv8.5
+//INST(CSDB, "CSDB", "11010101000000110010001010011111")
+INST(CLREX, "CLREX", "11010101000000110011MMMM01011111")
+INST(DSB, "DSB", "11010101000000110011MMMM10011111")
+//INST(SSBB, "SSBB", "11010101000000110011000010011111")
+//INST(PSSBB, "PSSBB", "11010101000000110011010010011111")
+INST(DMB, "DMB", "11010101000000110011MMMM10111111")
+INST(ISB, "ISB", "11010101000000110011MMMM11011111")
+//INST(SB, "SB", "11010101000000110011000011111111")
+//INST(SYS, "SYS", "1101010100001oooNNNNMMMMooottttt")
+INST(MSR_reg, "MSR (register)", "110101010001poooNNNNMMMMooottttt")
+//INST(SYSL, "SYSL", "1101010100101oooNNNNMMMMooottttt")
+INST(MRS, "MRS", "110101010011poooNNNNMMMMooottttt")
+
+// System - Flag manipulation instructions
+INST(CFINV, "CFINV", "11010101000000000100000000011111") // ARMv8.4
+INST(RMIF, "RMIF", "10111010000iiiiii00001nnnnn0IIII") // ARMv8.4
+//INST(SETF8, "SETF8", "0011101000000000000010nnnnn01101") // ARMv8.4
+//INST(SETF16, "SETF16", "0011101000000000010010nnnnn01101") // ARMv8.4
+
+// System - Flag format instructions
+INST(XAFlag, "XAFlag", "11010101000000000100000000111111") // ARMv8.5
+INST(AXFlag, "AXFlag", "11010101000000000100000001011111") // ARMv8.5
+
+// SYS: Data Cache
+INST(DC_IVAC, "DC IVAC", "110101010000100001110110001ttttt")
+INST(DC_ISW, "DC ISW", "110101010000100001110110010ttttt")
+INST(DC_CSW, "DC CSW", "110101010000100001111010010ttttt")
+INST(DC_CISW, "DC CISW", "110101010000100001111110010ttttt")
+INST(DC_ZVA, "DC ZVA", "110101010000101101110100001ttttt")
+INST(DC_CVAC, "DC CVAC", "110101010000101101111010001ttttt")
+INST(DC_CVAU, "DC CVAU", "110101010000101101111011001ttttt")
+INST(DC_CVAP, "DC CVAP", "110101010000101101111100001ttttt")
+INST(DC_CIVAC, "DC CIVAC", "110101010000101101111110001ttttt")
+
+// SYS: Instruction Cache
+INST(IC_IALLU, "IC IALLU", "11010101000010000111010100011111")
+INST(IC_IALLUIS, "IC IALLUIS", "11010101000010000111000100011111")
+INST(IC_IVAU, "IC IVAU", "110101010000101101110101001ttttt")
+
+// Unconditional branch (Register)
+INST(BLR, "BLR", "1101011000111111000000nnnnn00000")
+INST(BR, "BR", "1101011000011111000000nnnnn00000")
+//INST(DRPS, "DRPS", "11010110101111110000001111100000")
+//INST(ERET, "ERET", "11010110100111110000001111100000")
+INST(RET, "RET", "1101011001011111000000nnnnn00000")
+//INST(BLRA, "BLRAA, BLRAAZ, BLRAB, BLRABZ", "1101011Z0011111100001Mnnnnnmmmmm") // ARMv8.3
+//INST(BRA, "BRAA, BRAAZ, BRAB, BRABZ", "1101011Z0001111100001Mnnnnnmmmmm") // ARMv8.3
+//INST(ERETA, "ERETAA, ERETAB", "110101101001111100001M1111111111") // ARMv8.3
+//INST(RETA, "RETAA, RETAB", "110101100101111100001M1111111111") // ARMv8.3
+
+// Unconditional branch (immediate)
+INST(B_uncond, "B", "000101iiiiiiiiiiiiiiiiiiiiiiiiii")
+INST(BL, "BL", "100101iiiiiiiiiiiiiiiiiiiiiiiiii")
+
+// Compare and branch (immediate)
+INST(CBZ, "CBZ", "z0110100iiiiiiiiiiiiiiiiiiittttt")
+INST(CBNZ, "CBNZ", "z0110101iiiiiiiiiiiiiiiiiiittttt")
+INST(TBZ, "TBZ", "b0110110bbbbbiiiiiiiiiiiiiittttt")
+INST(TBNZ, "TBNZ", "b0110111bbbbbiiiiiiiiiiiiiittttt")
+
+// Loads and stores - Advanced SIMD Load/Store multiple structures
+INST(STx_mult_1, "STx (multiple structures)", "0Q00110000000000oooozznnnnnttttt")
+INST(STx_mult_2, "STx (multiple structures)", "0Q001100100mmmmmoooozznnnnnttttt")
+INST(LDx_mult_1, "LDx (multiple structures)", "0Q00110001000000oooozznnnnnttttt")
+INST(LDx_mult_2, "LDx (multiple structures)", "0Q001100110mmmmmoooozznnnnnttttt")
+
+// Loads and stores - Advanced SIMD Load/Store single structures
+INST(ST1_sngl_1, "ST1 (single structure)", "0Q00110100000000oo0Szznnnnnttttt")
+INST(ST1_sngl_2, "ST1 (single structure)", "0Q001101100mmmmmoo0Szznnnnnttttt")
+INST(ST3_sngl_1, "ST3 (single structure)", "0Q00110100000000oo1Szznnnnnttttt")
+INST(ST3_sngl_2, "ST3 (single structure)", "0Q001101100mmmmmoo1Szznnnnnttttt")
+INST(ST2_sngl_1, "ST2 (single structure)", "0Q00110100100000oo0Szznnnnnttttt")
+INST(ST2_sngl_2, "ST2 (single structure)", "0Q001101101mmmmmoo0Szznnnnnttttt")
+INST(ST4_sngl_1, "ST4 (single structure)", "0Q00110100100000oo1Szznnnnnttttt")
+INST(ST4_sngl_2, "ST4 (single structure)", "0Q001101101mmmmmoo1Szznnnnnttttt")
+INST(LD1_sngl_1, "LD1 (single structure)", "0Q00110101000000oo0Szznnnnnttttt")
+INST(LD1_sngl_2, "LD1 (single structure)", "0Q001101110mmmmmoo0Szznnnnnttttt")
+INST(LD3_sngl_1, "LD3 (single structure)", "0Q00110101000000oo1Szznnnnnttttt")
+INST(LD3_sngl_2, "LD3 (single structure)", "0Q001101110mmmmmoo1Szznnnnnttttt")
+INST(LD1R_1, "LD1R", "0Q001101010000001100zznnnnnttttt")
+INST(LD1R_2, "LD1R", "0Q001101110mmmmm1100zznnnnnttttt")
+INST(LD3R_1, "LD3R", "0Q001101010000001110zznnnnnttttt")
+INST(LD3R_2, "LD3R", "0Q001101110mmmmm1110zznnnnnttttt")
+INST(LD2_sngl_1, "LD2 (single structure)", "0Q00110101100000oo0Szznnnnnttttt")
+INST(LD2_sngl_2, "LD2 (single structure)", "0Q001101111mmmmmoo0Szznnnnnttttt")
+INST(LD4_sngl_1, "LD4 (single structure)", "0Q00110101100000oo1Szznnnnnttttt")
+INST(LD4_sngl_2, "LD4 (single structure)", "0Q001101111mmmmmoo1Szznnnnnttttt")
+INST(LD2R_1, "LD2R", "0Q001101011000001100zznnnnnttttt")
+INST(LD2R_2, "LD2R", "0Q001101111mmmmm1100zznnnnnttttt")
+INST(LD4R_1, "LD4R", "0Q001101011000001110zznnnnnttttt")
+INST(LD4R_2, "LD4R", "0Q001101111mmmmm1110zznnnnnttttt")
+
+// Loads and stores - Load/Store Exclusive
+INST(STXR, "STXRB, STXRH, STXR", "zz001000000sssss011111nnnnnttttt")
+INST(STLXR, "STLXRB, STLXRH, STLXR", "zz001000000sssss111111nnnnnttttt")
+INST(STXP, "STXP", "1z001000001sssss0uuuuunnnnnttttt")
+INST(STLXP, "STLXP", "1z001000001sssss1uuuuunnnnnttttt")
+INST(LDXR, "LDXRB, LDXRH, LDXR", "zz00100001011111011111nnnnnttttt")
+INST(LDAXR, "LDAXRB, LDAXRH, LDAXR", "zz00100001011111111111nnnnnttttt")
+INST(LDXP, "LDXP", "1z001000011111110uuuuunnnnnttttt")
+INST(LDAXP, "LDAXP", "1z001000011111111uuuuunnnnnttttt")
+INST(STLLR, "STLLRB, STLLRH, STLLR", "zz00100010011111011111nnnnnttttt")
+INST(STLR, "STLRB, STLRH, STLR", "zz00100010011111111111nnnnnttttt")
+INST(LDLAR, "LDLARB, LDLARH, LDLAR", "zz00100011011111011111nnnnnttttt")
+INST(LDAR, "LDARB, LDARH, LDAR", "zz00100011011111111111nnnnnttttt")
+//INST(CASP, "CASP, CASPA, CASPAL, CASPL", "0z0010000L1sssssp11111nnnnnttttt") // ARMv8.1
+//INST(CASB, "CASB, CASAB, CASALB, CASLB", "000010001L1sssssp11111nnnnnttttt") // ARMv8.1
+//INST(CASH, "CASH, CASAH, CASALH, CASLH", "010010001L1sssssp11111nnnnnttttt") // ARMv8.1
+//INST(CAS, "CAS, CASA, CASAL, CASL", "1z0010001L1sssssp11111nnnnnttttt") // ARMv8.1
+
+// Loads and stores - Load register (literal)
+INST(LDR_lit_gen, "LDR (literal)", "0z011000iiiiiiiiiiiiiiiiiiittttt")
+INST(LDRSW_lit, "LDRSW (literal)", "10011000iiiiiiiiiiiiiiiiiiittttt")
+INST(PRFM_lit, "PRFM (literal)", "11011000iiiiiiiiiiiiiiiiiiittttt")
+INST(LDR_lit_fpsimd, "LDR (literal, SIMD&FP)", "oo011100iiiiiiiiiiiiiiiiiiittttt")
+
+// Loads and stores - Load/Store no-allocate pair
+INST(STNP_LDNP_gen, "STNP/LDNP", "o01010000Liiiiiiiuuuuunnnnnttttt")
+INST(STNP_LDNP_fpsimd, "STNP/LDNP (SIMD&FP)", "oo1011000Liiiiiiiuuuuunnnnnttttt")
+
+// Loads and stores - Load/Store register pair
+INST(STP_LDP_gen, "STP/LDP", "oo10100pwLiiiiiiiuuuuunnnnnttttt")
+INST(UnallocatedEncoding, "", "--1010000-----------------------")
+INST(STP_LDP_fpsimd, "STP/LDP (SIMD&FP)", "oo10110pwLiiiiiiiuuuuunnnnnttttt")
+INST(UnallocatedEncoding, "", "--1011000-----------------------")
+
+// Loads and stores - Load/Store register (unscaled immediate)
+INST(STURx_LDURx, "STURx/LDURx", "zz111000oo0iiiiiiiii00nnnnnttttt")
+INST(UnallocatedEncoding, "", "111110001-0---------00----------")
+INST(UnallocatedEncoding, "", "10111000110---------00----------")
+INST(PRFM_imm, "PRFM (immediate)", "1111100110iiiiiiiiiiiinnnnnttttt")
+INST(PRFM_unscaled_imm, "PRFM (unscaled offset)", "11111000100iiiiiiiii00nnnnnttttt")
+INST(STUR_fpsimd, "STUR (SIMD&FP)", "zz111100o00iiiiiiiii00nnnnnttttt")
+INST(LDUR_fpsimd, "LDUR (SIMD&FP)", "zz111100o10iiiiiiiii00nnnnnttttt")
+
+// Loads and stores - Load/Store register (immediate pre/post-indexed)
+INST(STRx_LDRx_imm_1, "STRx/LDRx (immediate)", "zz111000oo0iiiiiiiiip1nnnnnttttt")
+INST(STRx_LDRx_imm_2, "STRx/LDRx (immediate)", "zz111001ooiiiiiiiiiiiinnnnnttttt")
+INST(UnallocatedEncoding, "", "111110001-0----------1----------")
+INST(UnallocatedEncoding, "", "10111000110----------1----------")
+INST(UnallocatedEncoding, "", "1111100111----------------------")
+INST(UnallocatedEncoding, "", "1011100111----------------------")
+INST(STR_imm_fpsimd_1, "STR (immediate, SIMD&FP)", "zz111100o00iiiiiiiiip1nnnnnttttt")
+INST(STR_imm_fpsimd_2, "STR (immediate, SIMD&FP)", "zz111101o0iiiiiiiiiiiinnnnnttttt")
+INST(LDR_imm_fpsimd_1, "LDR (immediate, SIMD&FP)", "zz111100o10iiiiiiiiip1nnnnnttttt")
+INST(LDR_imm_fpsimd_2, "LDR (immediate, SIMD&FP)", "zz111101o1iiiiiiiiiiiinnnnnttttt")
+//INST(STGP_1, "STGP (post-index)", "0110100010iiiiiiimmmmmnnnnnttttt") // ARMv8.5
+//INST(STGP_2, "STGP (pre-index)", "0110100110iiiiiiimmmmmnnnnnttttt") // ARMv8.5
+//INST(STGP_3, "STGP (signed-offset)", "0110100100iiiiiiimmmmmnnnnnttttt") // ARMv8.5
+
+// Loads and stores - Load/Store register (unprivileged)
+INST(STTRB, "STTRB", "00111000000iiiiiiiii10nnnnnttttt")
+INST(LDTRB, "LDTRB", "00111000010iiiiiiiii10nnnnnttttt")
+INST(LDTRSB, "LDTRSB", "00111000oo0iiiiiiiii10nnnnnttttt")
+INST(STTRH, "STTRH", "01111000000iiiiiiiii10nnnnnttttt")
+INST(LDTRH, "LDTRH", "01111000010iiiiiiiii10nnnnnttttt")
+INST(LDTRSH, "LDTRSH", "01111000oo0iiiiiiiii10nnnnnttttt")
+INST(STTR, "STTR", "zz111000000iiiiiiiii10nnnnnttttt")
+INST(LDTR, "LDTR", "zz111000010iiiiiiiii10nnnnnttttt")
+INST(LDTRSW, "LDTRSW", "10111000100iiiiiiiii10nnnnnttttt")
+
+// Loads and stores - Atomic memory options
+//INST(LDADDB, "LDADDB, LDADDAB, LDADDALB, LDADDLB", "00111000AR1sssss000000nnnnnttttt")
+//INST(LDCLRB, "LDCLRB, LDCLRAB, LDCLRALB, LDCLRLB", "00111000AR1sssss000100nnnnnttttt")
+//INST(LDEORB, "LDEORB, LDEORAB, LDEORALB, LDEORLB", "00111000AR1sssss001000nnnnnttttt")
+//INST(LDSETB, "LDSETB, LDSETAB, LDSETALB, LDSETLB", "00111000AR1sssss001100nnnnnttttt")
+//INST(LDSMAXB, "LDSMAXB, LDSMAXAB, LDSMAXALB, LDSMAXLB", "00111000AR1sssss010000nnnnnttttt")
+//INST(LDSMINB, "LDSMINB, LDSMINAB, LDSMINALB, LDSMINLB", "00111000AR1sssss010100nnnnnttttt")
+//INST(LDUMAXB, "LDUMAXB, LDUMAXAB, LDUMAXALB, LDUMAXLB", "00111000AR1sssss011000nnnnnttttt")
+//INST(LDUMINB, "LDUMINB, LDUMINAB, LDUMINALB, LDUMINLB", "00111000AR1sssss011100nnnnnttttt")
+//INST(SWPB, "SWPB, SWPAB, SWPALB, SWPLB", "00111000AR1sssss100000nnnnnttttt")
+//INST(LDAPRB, "LDAPRB", "0011100010111111110000nnnnnttttt")
+//INST(LDADDH, "LDADDH, LDADDAH, LDADDALH, LDADDLH", "01111000AR1sssss000000nnnnnttttt")
+//INST(LDCLRH, "LDCLRH, LDCLRAH, LDCLRALH, LDCLRLH", "01111000AR1sssss000100nnnnnttttt")
+//INST(LDEORH, "LDEORH, LDEORAH, LDEORALH, LDEORLH", "01111000AR1sssss001000nnnnnttttt")
+//INST(LDSETH, "LDSETH, LDSETAH, LDSETALH, LDSETLH", "01111000AR1sssss001100nnnnnttttt")
+//INST(LDSMAXH, "LDSMAXH, LDSMAXAH, LDSMAXALH, LDSMAXLH", "01111000AR1sssss010000nnnnnttttt")
+//INST(LDSMINH, "LDSMINH, LDSMINAH, LDSMINALH, LDSMINLH", "01111000AR1sssss010100nnnnnttttt")
+//INST(LDUMAXH, "LDUMAXH, LDUMAXAH, LDUMAXALH, LDUMAXLH", "01111000AR1sssss011000nnnnnttttt")
+//INST(LDUMINH, "LDUMINH, LDUMINAH, LDUMINALH, LDUMINLH", "01111000AR1sssss011100nnnnnttttt")
+//INST(SWPH, "SWPH, SWPAH, SWPALH, SWPLH", "01111000AR1sssss100000nnnnnttttt")
+//INST(LDAPRH, "LDAPRH", "0111100010111111110000nnnnnttttt")
+//INST(LDADD, "LDADD, LDADDA, LDADDAL, LDADDL", "1-111000AR1sssss000000nnnnnttttt")
+//INST(LDCLR, "LDCLR, LDCLRA, LDCLRAL, LDCLRL", "1-111000AR1sssss000100nnnnnttttt")
+//INST(LDEOR, "LDEOR, LDEORA, LDEORAL, LDEORL", "1-111000AR1sssss001000nnnnnttttt")
+//INST(LDSET, "LDSET, LDSETA, LDSETAL, LDSETL", "1-111000AR1sssss001100nnnnnttttt")
+//INST(LDSMAX, "LDSMAX, LDSMAXA, LDSMAXAL, LDSMAXL", "1-111000AR1sssss010000nnnnnttttt")
+//INST(LDSMIN, "LDSMIN, LDSMINA, LDSMINAL, LDSMINL", "1-111000AR1sssss010100nnnnnttttt")
+//INST(LDUMAX, "LDUMAX, LDUMAXA, LDUMAXAL, LDUMAXL", "1-111000AR1sssss011000nnnnnttttt")
+//INST(LDUMIN, "LDUMIN, LDUMINA, LDUMINAL, LDUMINL", "1-111000AR1sssss011100nnnnnttttt")
+//INST(SWP, "SWP, SWPA, SWPAL, SWPL", "1-111000AR1sssss100000nnnnnttttt")
+//INST(LDAPR, "LDAPR", "1-11100010111111110000nnnnnttttt")
+//INST(LD64B, "LD64B", "1111100000111111110100nnnnnttttt") // v8.7
+//INST(ST64B, "ST64B", "1111100000111111100100nnnnnttttt") // v8.7
+//INST(ST64BV, "ST64BV", "11111000001sssss101100nnnnnttttt") // v8.7
+//INST(ST64BV0, "ST64BV0", "11111000001sssss101000nnnnnttttt") // v8.7
+
+// Loads and stores - Load/Store register (register offset)
+INST(STRx_reg, "STRx (register)", "zz111000o01mmmmmxxxS10nnnnnttttt")
+INST(LDRx_reg, "LDRx (register)", "zz111000o11mmmmmxxxS10nnnnnttttt")
+INST(STR_reg_fpsimd, "STR (register, SIMD&FP)", "zz111100o01mmmmmxxxS10nnnnnttttt")
+INST(LDR_reg_fpsimd, "LDR (register, SIMD&FP)", "zz111100o11mmmmmxxxS10nnnnnttttt")
+
+// Loads and stores - Load/Store memory tags
+//INST(STG_1, "STG (post-index)", "11011001001iiiiiiiii01nnnnn11111") // ARMv8.5
+//INST(STG_2, "STG (pre-index)", "11011001001iiiiiiiii11nnnnn11111") // ARMv8.5
+//INST(STG_3, "STG (signed-offset)", "11011001001iiiiiiiii10nnnnn11111") // ARMv8.5
+//INST(LDG, "LDG", "11011001011iiiiiiiii00nnnnnttttt") // ARMv8.5
+//INST(STZG_1, "STZG (post-index)", "11011001011iiiiiiiii01nnnnn11111") // ARMv8.5
+//INST(STZG_2, "STZG (pre-index)", "11011001011iiiiiiiii11nnnnn11111") // ARMv8.5
+//INST(STZG_3, "STZG (signed-offset)", "11011001011iiiiiiiii10nnnnn11111") // ARMv8.5
+//INST(ST2G_1, "ST2G (post-index)", "11011001101iiiiiiiii01nnnnn11111") // ARMv8.5
+//INST(ST2G_2, "ST2G (pre-index)", "11011001101iiiiiiiii11nnnnn11111") // ARMv8.5
+//INST(ST2G_3, "ST2G (signed-offset)", "11011001101iiiiiiiii10nnnnn11111") // ARMv8.5
+//INST(STGV, "STGV", "1101100110100000000000nnnnnttttt") // ARMv8.5
+//INST(STZ2G_1, "STZ2G (post-index)", "11011001111iiiiiiiii01nnnnn11111") // ARMv8.5
+//INST(STZ2G_2, "STZ2G (pre-index)", "11011001111iiiiiiiii11nnnnn11111") // ARMv8.5
+//INST(STZ2G_3, "STZ2G (signed-offset)", "11011001111iiiiiiiii10nnnnn11111") // ARMv8.5
+//INST(LDGV, "LDGV", "1101100111100000000000nnnnnttttt") // ARMv8.5
+
+// Loads and stores - Load/Store register (pointer authentication)
+//INST(LDRA, "LDRAA, LDRAB", "11111000MS1iiiiiiiiiW1nnnnnttttt")
+
+// Data Processing - Register - 2 source
+INST(UDIV, "UDIV", "z0011010110mmmmm000010nnnnnddddd")
+INST(SDIV, "SDIV", "z0011010110mmmmm000011nnnnnddddd")
+INST(LSLV, "LSLV", "z0011010110mmmmm001000nnnnnddddd")
+INST(LSRV, "LSRV", "z0011010110mmmmm001001nnnnnddddd")
+INST(ASRV, "ASRV", "z0011010110mmmmm001010nnnnnddddd")
+INST(RORV, "RORV", "z0011010110mmmmm001011nnnnnddddd")
+INST(CRC32, "CRC32B, CRC32H, CRC32W, CRC32X", "z0011010110mmmmm0100zznnnnnddddd")
+INST(CRC32C, "CRC32CB, CRC32CH, CRC32CW, CRC32CX", "z0011010110mmmmm0101zznnnnnddddd")
+//INST(PACGA, "PACGA", "10011010110mmmmm001100nnnnnddddd")
+//INST(SUBP, "SUBP", "10011010110mmmmm000000nnnnnddddd") // ARMv8.5
+//INST(IRG, "IRG", "10011010110mmmmm000100nnnnnddddd") // ARMv8.5
+//INST(GMI, "GMI", "10011010110mmmmm000101nnnnnddddd") // ARMv8.5
+//INST(SUBPS, "SUBPS", "10111010110mmmmm000000nnnnnddddd") // ARMv8.5
+
+// Data Processing - Register - 1 source
+INST(RBIT_int, "RBIT", "z101101011000000000000nnnnnddddd")
+INST(REV16_int, "REV16", "z101101011000000000001nnnnnddddd")
+INST(REV, "REV", "z10110101100000000001onnnnnddddd")
+INST(CLZ_int, "CLZ", "z101101011000000000100nnnnnddddd")
+INST(CLS_int, "CLS", "z101101011000000000101nnnnnddddd")
+INST(REV32_int, "REV32", "1101101011000000000010nnnnnddddd")
+//INST(PACDA, "PACDA, PACDZA", "110110101100000100Z010nnnnnddddd")
+//INST(PACDB, "PACDB, PACDZB", "110110101100000100Z011nnnnnddddd")
+//INST(AUTDA, "AUTDA, AUTDZA", "110110101100000100Z110nnnnnddddd")
+//INST(AUTDB, "AUTDB, AUTDZB", "110110101100000100Z111nnnnnddddd")
+
+// Data Processing - Register - Logical (shifted register)
+INST(AND_shift, "AND (shifted register)", "z0001010ss0mmmmmiiiiiinnnnnddddd")
+INST(BIC_shift, "BIC (shifted register)", "z0001010ss1mmmmmiiiiiinnnnnddddd")
+INST(ORR_shift, "ORR (shifted register)", "z0101010ss0mmmmmiiiiiinnnnnddddd")
+INST(ORN_shift, "ORN (shifted register)", "z0101010ss1mmmmmiiiiiinnnnnddddd")
+INST(EOR_shift, "EOR (shifted register)", "z1001010ss0mmmmmiiiiiinnnnnddddd")
+INST(EON, "EON (shifted register)", "z1001010ss1mmmmmiiiiiinnnnnddddd")
+INST(ANDS_shift, "ANDS (shifted register)", "z1101010ss0mmmmmiiiiiinnnnnddddd")
+INST(BICS, "BICS (shifted register)", "z1101010ss1mmmmmiiiiiinnnnnddddd")
+
+// Data Processing - Register - Add/Sub (shifted register)
+INST(ADD_shift, "ADD (shifted register)", "z0001011ss0mmmmmiiiiiinnnnnddddd")
+INST(ADDS_shift, "ADDS (shifted register)", "z0101011ss0mmmmmiiiiiinnnnnddddd")
+INST(SUB_shift, "SUB (shifted register)", "z1001011ss0mmmmmiiiiiinnnnnddddd")
+INST(SUBS_shift, "SUBS (shifted register)", "z1101011ss0mmmmmiiiiiinnnnnddddd")
+
+// Data Processing - Register - Add/Sub (extended register)
+INST(ADD_ext, "ADD (extended register)", "z0001011001mmmmmxxxiiinnnnnddddd")
+INST(ADDS_ext, "ADDS (extended register)", "z0101011001mmmmmxxxiiinnnnnddddd")
+INST(SUB_ext, "SUB (extended register)", "z1001011001mmmmmxxxiiinnnnnddddd")
+INST(SUBS_ext, "SUBS (extended register)", "z1101011001mmmmmxxxiiinnnnnddddd")
+
+// Data Processing - Register - Add/Sub (with carry)
+INST(ADC, "ADC", "z0011010000mmmmm000000nnnnnddddd")
+INST(ADCS, "ADCS", "z0111010000mmmmm000000nnnnnddddd")
+INST(SBC, "SBC", "z1011010000mmmmm000000nnnnnddddd")
+INST(SBCS, "SBCS", "z1111010000mmmmm000000nnnnnddddd")
+
+// Data Processing - Register - Conditional compare
+INST(CCMN_reg, "CCMN (register)", "z0111010010mmmmmcccc00nnnnn0ffff")
+INST(CCMP_reg, "CCMP (register)", "z1111010010mmmmmcccc00nnnnn0ffff")
+INST(CCMN_imm, "CCMN (immediate)", "z0111010010iiiiicccc10nnnnn0ffff")
+INST(CCMP_imm, "CCMP (immediate)", "z1111010010iiiiicccc10nnnnn0ffff")
+
+// Data Processing - Register - Conditional select
+INST(CSEL, "CSEL", "z0011010100mmmmmcccc00nnnnnddddd")
+INST(CSINC, "CSINC", "z0011010100mmmmmcccc01nnnnnddddd")
+INST(CSINV, "CSINV", "z1011010100mmmmmcccc00nnnnnddddd")
+INST(CSNEG, "CSNEG", "z1011010100mmmmmcccc01nnnnnddddd")
+
+// Data Processing - Register - 3 source
+INST(MADD, "MADD", "z0011011000mmmmm0aaaaannnnnddddd")
+INST(MSUB, "MSUB", "z0011011000mmmmm1aaaaannnnnddddd")
+INST(SMADDL, "SMADDL", "10011011001mmmmm0aaaaannnnnddddd")
+INST(SMSUBL, "SMSUBL", "10011011001mmmmm1aaaaannnnnddddd")
+INST(SMULH, "SMULH", "10011011010mmmmm011111nnnnnddddd")
+INST(UMADDL, "UMADDL", "10011011101mmmmm0aaaaannnnnddddd")
+INST(UMSUBL, "UMSUBL", "10011011101mmmmm1aaaaannnnnddddd")
+INST(UMULH, "UMULH", "10011011110mmmmm011111nnnnnddddd")
+
+// Data Processing - FP and SIMD - AES
+INST(AESE, "AESE", "0100111000101000010010nnnnnddddd")
+INST(AESD, "AESD", "0100111000101000010110nnnnnddddd")
+INST(AESMC, "AESMC", "0100111000101000011010nnnnnddddd")
+INST(AESIMC, "AESIMC", "0100111000101000011110nnnnnddddd")
+
+// Data Processing - FP and SIMD - SHA
+INST(SHA1C, "SHA1C", "01011110000mmmmm000000nnnnnddddd")
+INST(SHA1P, "SHA1P", "01011110000mmmmm000100nnnnnddddd")
+INST(SHA1M, "SHA1M", "01011110000mmmmm001000nnnnnddddd")
+INST(SHA1SU0, "SHA1SU0", "01011110000mmmmm001100nnnnnddddd")
+INST(SHA256H, "SHA256H", "01011110000mmmmm010000nnnnnddddd")
+INST(SHA256H2, "SHA256H2", "01011110000mmmmm010100nnnnnddddd")
+INST(SHA256SU1, "SHA256SU1", "01011110000mmmmm011000nnnnnddddd")
+INST(SHA1H, "SHA1H", "0101111000101000000010nnnnnddddd")
+INST(SHA1SU1, "SHA1SU1", "0101111000101000000110nnnnnddddd")
+INST(SHA256SU0, "SHA256SU0", "0101111000101000001010nnnnnddddd")
+
+// Data Processing - FP and SIMD - Scalar copy
+INST(DUP_elt_1, "DUP (element)", "01011110000iiiii000001nnnnnddddd")
+
+// Data Processing - FP and SIMD - Scalar three
+//INST(FMULX_vec_1, "FMULX", "01011110010mmmmm000111nnnnnddddd")
+INST(FMULX_vec_2, "FMULX", "010111100z1mmmmm110111nnnnnddddd")
+INST(FCMEQ_reg_1, "FCMEQ (register)", "01011110010mmmmm001001nnnnnddddd")
+INST(FCMEQ_reg_2, "FCMEQ (register)", "010111100z1mmmmm111001nnnnnddddd")
+INST(FRECPS_1, "FRECPS", "01011110010mmmmm001111nnnnnddddd")
+INST(FRECPS_2, "FRECPS", "010111100z1mmmmm111111nnnnnddddd")
+INST(FRSQRTS_1, "FRSQRTS", "01011110110mmmmm001111nnnnnddddd")
+INST(FRSQRTS_2, "FRSQRTS", "010111101z1mmmmm111111nnnnnddddd")
+//INST(FCMGE_reg_1, "FCMGE (register)", "01111110010mmmmm001001nnnnnddddd")
+INST(FCMGE_reg_2, "FCMGE (register)", "011111100z1mmmmm111001nnnnnddddd")
+//INST(FACGE_1, "FACGE", "01111110010mmmmm001011nnnnnddddd")
+INST(FACGE_2, "FACGE", "011111100z1mmmmm111011nnnnnddddd")
+//INST(FABD_1, "FABD", "01111110110mmmmm000101nnnnnddddd")
+INST(FABD_2, "FABD", "011111101z1mmmmm110101nnnnnddddd")
+//INST(FCMGT_reg_1, "FCMGT (register)", "01111110110mmmmm001001nnnnnddddd")
+INST(FCMGT_reg_2, "FCMGT (register)", "011111101z1mmmmm111001nnnnnddddd")
+//INST(FACGT_1, "FACGT", "01111110110mmmmm001011nnnnnddddd")
+INST(FACGT_2, "FACGT", "011111101z1mmmmm111011nnnnnddddd")
+
+// Data Processing - FP and SIMD - Scalar two register misc
+//INST(FCVTNS_1, "FCVTNS (vector)", "0101111001111001101010nnnnnddddd")
+INST(FCVTNS_2, "FCVTNS (vector)", "010111100z100001101010nnnnnddddd")
+//INST(FCVTMS_1, "FCVTMS (vector)", "0101111001111001101110nnnnnddddd")
+INST(FCVTMS_2, "FCVTMS (vector)", "010111100z100001101110nnnnnddddd")
+//INST(FCVTAS_1, "FCVTAS (vector)", "0101111001111001110010nnnnnddddd")
+INST(FCVTAS_2, "FCVTAS (vector)", "010111100z100001110010nnnnnddddd")
+//INST(SCVTF_int_1, "SCVTF (vector, integer)", "0101111001111001110110nnnnnddddd")
+INST(SCVTF_int_2, "SCVTF (vector, integer)", "010111100z100001110110nnnnnddddd")
+//INST(FCMGT_zero_1, "FCMGT (zero)", "0101111011111000110010nnnnnddddd")
+INST(FCMGT_zero_2, "FCMGT (zero)", "010111101z100000110010nnnnnddddd")
+INST(FCMEQ_zero_1, "FCMEQ (zero)", "0101111011111000110110nnnnnddddd")
+INST(FCMEQ_zero_2, "FCMEQ (zero)", "010111101z100000110110nnnnnddddd")
+//INST(FCMLT_1, "FCMLT (zero)", "0101111011111000111010nnnnnddddd")
+INST(FCMLT_2, "FCMLT (zero)", "010111101z100000111010nnnnnddddd")
+//INST(FCVTPS_1, "FCVTPS (vector)", "0101111011111001101010nnnnnddddd")
+INST(FCVTPS_2, "FCVTPS (vector)", "010111101z100001101010nnnnnddddd")
+//INST(FCVTZS_int_1, "FCVTZS (vector, integer)", "0101111011111001101110nnnnnddddd")
+INST(FCVTZS_int_2, "FCVTZS (vector, integer)", "010111101z100001101110nnnnnddddd")
+INST(FRECPE_1, "FRECPE", "0101111011111001110110nnnnnddddd")
+INST(FRECPE_2, "FRECPE", "010111101z100001110110nnnnnddddd")
+INST(FRECPX_1, "FRECPX", "0101111011111001111110nnnnnddddd")
+INST(FRECPX_2, "FRECPX", "010111101z100001111110nnnnnddddd")
+//INST(FCVTNU_1, "FCVTNU (vector)", "0111111001111001101010nnnnnddddd")
+INST(FCVTNU_2, "FCVTNU (vector)", "011111100z100001101010nnnnnddddd")
+//INST(FCVTMU_1, "FCVTMU (vector)", "0111111001111001101110nnnnnddddd")
+INST(FCVTMU_2, "FCVTMU (vector)", "011111100z100001101110nnnnnddddd")
+//INST(FCVTAU_1, "FCVTAU (vector)", "0111111001111001110010nnnnnddddd")
+INST(FCVTAU_2, "FCVTAU (vector)", "011111100z100001110010nnnnnddddd")
+//INST(UCVTF_int_1, "UCVTF (vector, integer)", "0111111001111001110110nnnnnddddd")
+INST(UCVTF_int_2, "UCVTF (vector, integer)", "011111100z100001110110nnnnnddddd")
+//INST(FCMGE_zero_1, "FCMGE (zero)", "0111111011111000110010nnnnnddddd")
+INST(FCMGE_zero_2, "FCMGE (zero)", "011111101z100000110010nnnnnddddd")
+//INST(FCMLE_1, "FCMLE (zero)", "0111111011111000110110nnnnnddddd")
+INST(FCMLE_2, "FCMLE (zero)", "011111101z100000110110nnnnnddddd")
+//INST(FCVTPU_1, "FCVTPU (vector)", "0111111011111001101010nnnnnddddd")
+INST(FCVTPU_2, "FCVTPU (vector)", "011111101z100001101010nnnnnddddd")
+//INST(FCVTZU_int_1, "FCVTZU (vector, integer)", "0111111011111001101110nnnnnddddd")
+INST(FCVTZU_int_2, "FCVTZU (vector, integer)", "011111101z100001101110nnnnnddddd")
+INST(FRSQRTE_1, "FRSQRTE", "0111111011111001110110nnnnnddddd")
+INST(FRSQRTE_2, "FRSQRTE", "011111101z100001110110nnnnnddddd")
+
+// Data Processing - FP and SIMD - Scalar three same extra
+//INST(SQRDMLAH_vec_1, "SQRDMLAH (vector)", "01111110zz0mmmmm100001nnnnnddddd")
+//INST(SQRDMLAH_vec_2, "SQRDMLAH (vector)", "0Q101110zz0mmmmm100001nnnnnddddd")
+//INST(SQRDMLSH_vec_1, "SQRDMLSH (vector)", "01111110zz0mmmmm100011nnnnnddddd")
+//INST(SQRDMLSH_vec_2, "SQRDMLSH (vector)", "0Q101110zz0mmmmm100011nnnnnddddd")
+
+// Data Processing - FP and SIMD - Scalar two-register misc
+INST(SUQADD_1, "SUQADD", "01011110zz100000001110nnnnnddddd")
+INST(SQABS_1, "SQABS", "01011110zz100000011110nnnnnddddd")
+INST(CMGT_zero_1, "CMGT (zero)", "01011110zz100000100010nnnnnddddd")
+INST(CMEQ_zero_1, "CMEQ (zero)", "01011110zz100000100110nnnnnddddd")
+INST(CMLT_1, "CMLT (zero)", "01011110zz100000101010nnnnnddddd")
+INST(ABS_1, "ABS", "01011110zz100000101110nnnnnddddd")
+INST(SQXTN_1, "SQXTN, SQXTN2", "01011110zz100001010010nnnnnddddd")
+INST(USQADD_1, "USQADD", "01111110zz100000001110nnnnnddddd")
+INST(SQNEG_1, "SQNEG", "01111110zz100000011110nnnnnddddd")
+INST(CMGE_zero_1, "CMGE (zero)", "01111110zz100000100010nnnnnddddd")
+INST(CMLE_1, "CMLE (zero)", "01111110zz100000100110nnnnnddddd")
+INST(NEG_1, "NEG (vector)", "01111110zz100000101110nnnnnddddd")
+INST(SQXTUN_1, "SQXTUN, SQXTUN2", "01111110zz100001001010nnnnnddddd")
+INST(UQXTN_1, "UQXTN, UQXTN2", "01111110zz100001010010nnnnnddddd")
+INST(FCVTXN_1, "FCVTXN, FCVTXN2", "011111100z100001011010nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Scalar pairwise
+INST(ADDP_pair, "ADDP (scalar)", "01011110zz110001101110nnnnnddddd")
+//INST(FMAXNMP_pair_1, "FMAXNMP (scalar)", "0101111000110000110010nnnnnddddd")
+INST(FMAXNMP_pair_2, "FMAXNMP (scalar)", "011111100z110000110010nnnnnddddd")
+//INST(FADDP_pair_1, "FADDP (scalar)", "0101111000110000110110nnnnnddddd")
+INST(FADDP_pair_2, "FADDP (scalar)", "011111100z110000110110nnnnnddddd")
+//INST(FMAXP_pair_1, "FMAXP (scalar)", "0101111000110000111110nnnnnddddd")
+INST(FMAXP_pair_2, "FMAXP (scalar)", "011111100z110000111110nnnnnddddd")
+//INST(FMINNMP_pair_1, "FMINNMP (scalar)", "0101111010110000110010nnnnnddddd")
+INST(FMINNMP_pair_2, "FMINNMP (scalar)", "011111101z110000110010nnnnnddddd")
+//INST(FMINP_pair_1, "FMINP (scalar)", "0101111010110000111110nnnnnddddd")
+INST(FMINP_pair_2, "FMINP (scalar)", "011111101z110000111110nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Scalar three different
+//INST(SQDMLAL_vec_1, "SQDMLAL, SQDMLAL2 (vector)", "01011110zz1mmmmm100100nnnnnddddd")
+//INST(SQDMLSL_vec_1, "SQDMLSL, SQDMLSL2 (vector)", "01011110zz1mmmmm101100nnnnnddddd")
+//INST(SQDMULL_vec_1, "SQDMULL, SQDMULL2 (vector)", "01011110zz1mmmmm110100nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Scalar three same
+INST(SQADD_1, "SQADD", "01011110zz1mmmmm000011nnnnnddddd")
+INST(SQSUB_1, "SQSUB", "01011110zz1mmmmm001011nnnnnddddd")
+INST(CMGT_reg_1, "CMGT (register)", "01011110zz1mmmmm001101nnnnnddddd")
+INST(CMGE_reg_1, "CMGE (register)", "01011110zz1mmmmm001111nnnnnddddd")
+INST(SSHL_1, "SSHL", "01011110zz1mmmmm010001nnnnnddddd")
+INST(SQSHL_reg_1, "SQSHL (register)", "01011110zz1mmmmm010011nnnnnddddd")
+INST(SRSHL_1, "SRSHL", "01011110zz1mmmmm010101nnnnnddddd")
+//INST(SQRSHL_1, "SQRSHL", "01011110zz1mmmmm010111nnnnnddddd")
+INST(ADD_1, "ADD (vector)", "01011110zz1mmmmm100001nnnnnddddd")
+INST(CMTST_1, "CMTST", "01011110zz1mmmmm100011nnnnnddddd")
+INST(SQDMULH_vec_1, "SQDMULH (vector)", "01011110zz1mmmmm101101nnnnnddddd")
+INST(UQADD_1, "UQADD", "01111110zz1mmmmm000011nnnnnddddd")
+INST(UQSUB_1, "UQSUB", "01111110zz1mmmmm001011nnnnnddddd")
+INST(CMHI_1, "CMHI (register)", "01111110zz1mmmmm001101nnnnnddddd")
+INST(CMHS_1, "CMHS (register)", "01111110zz1mmmmm001111nnnnnddddd")
+INST(USHL_1, "USHL", "01111110zz1mmmmm010001nnnnnddddd")
+INST(UQSHL_reg_1, "UQSHL (register)", "01111110zz1mmmmm010011nnnnnddddd")
+INST(URSHL_1, "URSHL", "01111110zz1mmmmm010101nnnnnddddd")
+//INST(UQRSHL_1, "UQRSHL", "01111110zz1mmmmm010111nnnnnddddd")
+INST(SUB_1, "SUB (vector)", "01111110zz1mmmmm100001nnnnnddddd")
+INST(CMEQ_reg_1, "CMEQ (register)", "01111110zz1mmmmm100011nnnnnddddd")
+INST(SQRDMULH_vec_1, "SQRDMULH (vector)", "01111110zz1mmmmm101101nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Scalar shift by immediate
+INST(SSHR_1, "SSHR", "010111110IIIIiii000001nnnnnddddd")
+INST(SSRA_1, "SSRA", "010111110IIIIiii000101nnnnnddddd")
+INST(SRSHR_1, "SRSHR", "010111110IIIIiii001001nnnnnddddd")
+INST(SRSRA_1, "SRSRA", "010111110IIIIiii001101nnnnnddddd")
+INST(SHL_1, "SHL", "010111110IIIIiii010101nnnnnddddd")
+INST(SQSHL_imm_1, "SQSHL (immediate)", "010111110IIIIiii011101nnnnnddddd")
+INST(SQSHRN_1, "SQSHRN, SQSHRN2", "010111110IIIIiii100101nnnnnddddd")
+//INST(SQRSHRN_1, "SQRSHRN, SQRSHRN2", "010111110IIIIiii100111nnnnnddddd")
+INST(SCVTF_fix_1, "SCVTF (vector, fixed-point)", "010111110IIIIiii111001nnnnnddddd")
+INST(FCVTZS_fix_1, "FCVTZS (vector, fixed-point)", "010111110IIIIiii111111nnnnnddddd")
+INST(USHR_1, "USHR", "011111110IIIIiii000001nnnnnddddd")
+INST(USRA_1, "USRA", "011111110IIIIiii000101nnnnnddddd")
+INST(URSHR_1, "URSHR", "011111110IIIIiii001001nnnnnddddd")
+INST(URSRA_1, "URSRA", "011111110IIIIiii001101nnnnnddddd")
+INST(SRI_1, "SRI", "011111110IIIIiii010001nnnnnddddd")
+INST(SLI_1, "SLI", "011111110IIIIiii010101nnnnnddddd")
+INST(SQSHLU_1, "SQSHLU", "011111110IIIIiii011001nnnnnddddd")
+INST(UQSHL_imm_1, "UQSHL (immediate)", "011111110IIIIiii011101nnnnnddddd")
+INST(SQSHRUN_1, "SQSHRUN, SQSHRUN2", "011111110IIIIiii100001nnnnnddddd")
+//INST(SQRSHRUN_1, "SQRSHRUN, SQRSHRUN2", "011111110IIIIiii100011nnnnnddddd")
+INST(UQSHRN_1, "UQSHRN, UQSHRN2", "011111110IIIIiii100101nnnnnddddd")
+//INST(UQRSHRN_1, "UQRSHRN, UQRSHRN2", "011111110IIIIiii100111nnnnnddddd")
+INST(UCVTF_fix_1, "UCVTF (vector, fixed-point)", "011111110IIIIiii111001nnnnnddddd")
+INST(FCVTZU_fix_1, "FCVTZU (vector, fixed-point)", "011111110IIIIiii111111nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Scalar x indexed element
+//INST(SQDMLAL_elt_1, "SQDMLAL, SQDMLAL2 (by element)", "01011111zzLMmmmm0011H0nnnnnddddd")
+//INST(SQDMLSL_elt_1, "SQDMLSL, SQDMLSL2 (by element)", "01011111zzLMmmmm0111H0nnnnnddddd")
+INST(SQDMULL_elt_1, "SQDMULL, SQDMULL2 (by element)", "01011111zzLMmmmm1011H0nnnnnddddd")
+INST(SQDMULH_elt_1, "SQDMULH (by element)", "01011111zzLMmmmm1100H0nnnnnddddd")
+INST(SQRDMULH_elt_1, "SQRDMULH (by element)", "01011111zzLMmmmm1101H0nnnnnddddd")
+INST(FMLA_elt_1, "FMLA (by element)", "0101111100LMmmmm0001H0nnnnnddddd")
+INST(FMLA_elt_2, "FMLA (by element)", "010111111zLMmmmm0001H0nnnnnddddd")
+INST(FMLS_elt_1, "FMLS (by element)", "0101111100LMmmmm0101H0nnnnnddddd")
+INST(FMLS_elt_2, "FMLS (by element)", "010111111zLMmmmm0101H0nnnnnddddd")
+//INST(FMUL_elt_1, "FMUL (by element)", "0101111100LMmmmm1001H0nnnnnddddd")
+INST(FMUL_elt_2, "FMUL (by element)", "010111111zLMmmmm1001H0nnnnnddddd")
+//INST(SQRDMLAH_elt_1, "SQRDMLAH (by element)", "01111111zzLMmmmm1101H0nnnnnddddd")
+//INST(SQRDMLSH_elt_1, "SQRDMLSH (by element)", "01111111zzLMmmmm1111H0nnnnnddddd")
+//INST(FMULX_elt_1, "FMULX (by element)", "0111111100LMmmmm1001H0nnnnnddddd")
+INST(FMULX_elt_2, "FMULX (by element)", "011111111zLMmmmm1001H0nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Table Lookup
+INST(TBL, "TBL", "0Q001110000mmmmm0LL000nnnnnddddd")
+INST(TBX, "TBX", "0Q001110000mmmmm0LL100nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Permute
+INST(UZP1, "UZP1", "0Q001110zz0mmmmm000110nnnnnddddd")
+INST(TRN1, "TRN1", "0Q001110zz0mmmmm001010nnnnnddddd")
+INST(ZIP1, "ZIP1", "0Q001110zz0mmmmm001110nnnnnddddd")
+INST(UZP2, "UZP2", "0Q001110zz0mmmmm010110nnnnnddddd")
+INST(TRN2, "TRN2", "0Q001110zz0mmmmm011010nnnnnddddd")
+INST(ZIP2, "ZIP2", "0Q001110zz0mmmmm011110nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Extract
+INST(EXT, "EXT", "0Q101110000mmmmm0iiii0nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Copy
+INST(DUP_elt_2, "DUP (element)", "0Q001110000iiiii000001nnnnnddddd")
+INST(DUP_gen, "DUP (general)", "0Q001110000iiiii000011nnnnnddddd")
+INST(SMOV, "SMOV", "0Q001110000iiiii001011nnnnnddddd")
+INST(UMOV, "UMOV", "0Q001110000iiiii001111nnnnnddddd")
+INST(INS_gen, "INS (general)", "01001110000iiiii000111nnnnnddddd")
+INST(INS_elt, "INS (element)", "01101110000iiiii0iiii1nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Three same
+//INST(FMULX_vec_3, "FMULX", "0Q001110010mmmmm000111nnnnnddddd")
+INST(FCMEQ_reg_3, "FCMEQ (register)", "0Q001110010mmmmm001001nnnnnddddd")
+INST(FRECPS_3, "FRECPS", "0Q001110010mmmmm001111nnnnnddddd")
+INST(FRSQRTS_3, "FRSQRTS", "0Q001110110mmmmm001111nnnnnddddd")
+//INST(FCMGE_reg_3, "FCMGE (register)", "0Q101110010mmmmm001001nnnnnddddd")
+//INST(FACGE_3, "FACGE", "0Q101110010mmmmm001011nnnnnddddd")
+//INST(FABD_3, "FABD", "0Q101110110mmmmm000101nnnnnddddd")
+//INST(FCMGT_reg_3, "FCMGT (register)", "0Q101110110mmmmm001001nnnnnddddd")
+//INST(FACGT_3, "FACGT", "0Q101110110mmmmm001011nnnnnddddd")
+//INST(FMAXNM_1, "FMAXNM (vector)", "0Q001110010mmmmm000001nnnnnddddd")
+INST(FMLA_vec_1, "FMLA (vector)", "0Q001110010mmmmm000011nnnnnddddd")
+//INST(FADD_1, "FADD (vector)", "0Q001110010mmmmm000101nnnnnddddd")
+//INST(FMAX_1, "FMAX (vector)", "0Q001110010mmmmm001101nnnnnddddd")
+//INST(FMINNM_1, "FMINNM (vector)", "0Q001110110mmmmm000001nnnnnddddd")
+INST(FMLS_vec_1, "FMLS (vector)", "0Q001110110mmmmm000011nnnnnddddd")
+//INST(FSUB_1, "FSUB (vector)", "0Q001110110mmmmm000101nnnnnddddd")
+//INST(FMIN_1, "FMIN (vector)", "0Q001110110mmmmm001101nnnnnddddd")
+//INST(FMAXNMP_vec_1, "FMAXNMP (vector)", "0Q101110010mmmmm000001nnnnnddddd")
+//INST(FADDP_vec_1, "FADDP (vector)", "0Q101110010mmmmm000101nnnnnddddd")
+//INST(FMUL_vec_1, "FMUL (vector)", "0Q101110010mmmmm000111nnnnnddddd")
+//INST(FMAXP_vec_1, "FMAXP (vector)", "0Q101110010mmmmm001101nnnnnddddd")
+//INST(FDIV_1, "FDIV (vector)", "0Q101110010mmmmm001111nnnnnddddd")
+//INST(FMINNMP_vec_1, "FMINNMP (vector)", "0Q101110110mmmmm000001nnnnnddddd")
+//INST(FMINP_vec_1, "FMINP (vector)", "0Q101110110mmmmm001101nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Three same extra
+//INST(SMMLA_vec, "SMMLA", "01001110100mmmmm101001nnnnnddddd") // v8.6
+//INST(UMMLA_vec, "UMMLA", "01101110100mmmmm101001nnnnnddddd") // v8.6
+//INST(USMMLA_vec, "USMMLA", "01001110100mmmmm101011nnnnnddddd") // v8.6
+//INST(SUDOT_element, "SUDOT (by element)", "0Q00111100LMmmmm1111H0nnnnnddddd") // v8.6
+//INST(USDOT_element, "USDOT (by_element)", "0Q00111110LMmmmm1111H0nnnnnddddd") // v8.6
+//INST(USDOT_vec, "USDOT (vector)", "0Q001110100mmmmm100111nnnnnddddd") // v8.6
+INST(SDOT_vec, "SDOT (vector)", "0Q001110zz0mmmmm100101nnnnnddddd")
+INST(UDOT_vec, "UDOT (vector)", "0Q101110zz0mmmmm100101nnnnnddddd")
+INST(FCMLA_vec, "FCMLA", "0Q101110zz0mmmmm110rr1nnnnnddddd")
+INST(FCADD_vec, "FCADD", "0Q101110zz0mmmmm111r01nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD Two-register misc
+INST(REV64_asimd, "REV64", "0Q001110zz100000000010nnnnnddddd")
+INST(REV16_asimd, "REV16 (vector)", "0Q001110zz100000000110nnnnnddddd")
+INST(SADDLP, "SADDLP", "0Q001110zz100000001010nnnnnddddd")
+INST(SUQADD_2, "SUQADD", "0Q001110zz100000001110nnnnnddddd")
+INST(CLS_asimd, "CLS (vector)", "0Q001110zz100000010010nnnnnddddd")
+INST(CNT, "CNT", "0Q001110zz100000010110nnnnnddddd")
+INST(SADALP, "SADALP", "0Q001110zz100000011010nnnnnddddd")
+INST(SQABS_2, "SQABS", "0Q001110zz100000011110nnnnnddddd")
+INST(CMGT_zero_2, "CMGT (zero)", "0Q001110zz100000100010nnnnnddddd")
+INST(CMEQ_zero_2, "CMEQ (zero)", "0Q001110zz100000100110nnnnnddddd")
+INST(CMLT_2, "CMLT (zero)", "0Q001110zz100000101010nnnnnddddd")
+INST(ABS_2, "ABS", "0Q001110zz100000101110nnnnnddddd")
+INST(XTN, "XTN, XTN2", "0Q001110zz100001001010nnnnnddddd")
+INST(SQXTN_2, "SQXTN, SQXTN2", "0Q001110zz100001010010nnnnnddddd")
+INST(FCVTN, "FCVTN, FCVTN2", "0Q0011100z100001011010nnnnnddddd")
+INST(FCVTL, "FCVTL, FCVTL2", "0Q0011100z100001011110nnnnnddddd")
+INST(FRINTN_1, "FRINTN (vector)", "0Q00111001111001100010nnnnnddddd")
+INST(FRINTN_2, "FRINTN (vector)", "0Q0011100z100001100010nnnnnddddd")
+INST(FRINTM_1, "FRINTM (vector)", "0Q00111001111001100110nnnnnddddd")
+INST(FRINTM_2, "FRINTM (vector)", "0Q0011100z100001100110nnnnnddddd")
+//INST(FCVTNS_3, "FCVTNS (vector)", "0Q00111001111001101010nnnnnddddd")
+INST(FCVTNS_4, "FCVTNS (vector)", "0Q0011100z100001101010nnnnnddddd")
+//INST(FCVTMS_3, "FCVTMS (vector)", "0Q00111001111001101110nnnnnddddd")
+INST(FCVTMS_4, "FCVTMS (vector)", "0Q0011100z100001101110nnnnnddddd")
+//INST(FCVTAS_3, "FCVTAS (vector)", "0Q00111001111001110010nnnnnddddd")
+INST(FCVTAS_4, "FCVTAS (vector)", "0Q0011100z100001110010nnnnnddddd")
+//INST(SCVTF_int_3, "SCVTF (vector, integer)", "0Q00111001111001110110nnnnnddddd")
+INST(SCVTF_int_4, "SCVTF (vector, integer)", "0Q0011100z100001110110nnnnnddddd")
+//INST(FCMGT_zero_3, "FCMGT (zero)", "0Q00111011111000110010nnnnnddddd")
+INST(FCMGT_zero_4, "FCMGT (zero)", "0Q0011101z100000110010nnnnnddddd")
+INST(FCMEQ_zero_3, "FCMEQ (zero)", "0Q00111011111000110110nnnnnddddd")
+INST(FCMEQ_zero_4, "FCMEQ (zero)", "0Q0011101z100000110110nnnnnddddd")
+//INST(FCMLT_3, "FCMLT (zero)", "0Q00111011111000111010nnnnnddddd")
+INST(FCMLT_4, "FCMLT (zero)", "0Q0011101z100000111010nnnnnddddd")
+INST(FABS_1, "FABS (vector)", "0Q00111011111000111110nnnnnddddd")
+INST(FABS_2, "FABS (vector)", "0Q0011101z100000111110nnnnnddddd")
+INST(FRINTP_1, "FRINTP (vector)", "0Q00111011111001100010nnnnnddddd")
+INST(FRINTP_2, "FRINTP (vector)", "0Q0011101z100001100010nnnnnddddd")
+INST(FRINTZ_1, "FRINTZ (vector)", "0Q00111011111001100110nnnnnddddd")
+INST(FRINTZ_2, "FRINTZ (vector)", "0Q0011101z100001100110nnnnnddddd")
+//INST(FCVTPS_3, "FCVTPS (vector)", "0Q00111011111001101010nnnnnddddd")
+INST(FCVTPS_4, "FCVTPS (vector)", "0Q0011101z100001101010nnnnnddddd")
+//INST(FCVTZS_int_3, "FCVTZS (vector, integer)", "0Q00111011111001101110nnnnnddddd")
+INST(FCVTZS_int_4, "FCVTZS (vector, integer)", "0Q0011101z100001101110nnnnnddddd")
+INST(URECPE, "URECPE", "0Q0011101z100001110010nnnnnddddd")
+INST(FRECPE_3, "FRECPE", "0Q00111011111001110110nnnnnddddd")
+INST(FRECPE_4, "FRECPE", "0Q0011101z100001110110nnnnnddddd")
+INST(REV32_asimd, "REV32 (vector)", "0Q101110zz100000000010nnnnnddddd")
+INST(UADDLP, "UADDLP", "0Q101110zz100000001010nnnnnddddd")
+INST(USQADD_2, "USQADD", "0Q101110zz100000001110nnnnnddddd")
+INST(CLZ_asimd, "CLZ (vector)", "0Q101110zz100000010010nnnnnddddd")
+INST(UADALP, "UADALP", "0Q101110zz100000011010nnnnnddddd")
+INST(SQNEG_2, "SQNEG", "0Q101110zz100000011110nnnnnddddd")
+INST(CMGE_zero_2, "CMGE (zero)", "0Q101110zz100000100010nnnnnddddd")
+INST(CMLE_2, "CMLE (zero)", "0Q101110zz100000100110nnnnnddddd")
+INST(NEG_2, "NEG (vector)", "0Q101110zz100000101110nnnnnddddd")
+INST(SQXTUN_2, "SQXTUN, SQXTUN2", "0Q101110zz100001001010nnnnnddddd")
+INST(SHLL, "SHLL, SHLL2", "0Q101110zz100001001110nnnnnddddd")
+INST(UQXTN_2, "UQXTN, UQXTN2", "0Q101110zz100001010010nnnnnddddd")
+INST(FCVTXN_2, "FCVTXN, FCVTXN2", "0Q1011100z100001011010nnnnnddddd")
+INST(FRINTA_1, "FRINTA (vector)", "0Q10111001111001100010nnnnnddddd")
+INST(FRINTA_2, "FRINTA (vector)", "0Q1011100z100001100010nnnnnddddd")
+INST(FRINTX_1, "FRINTX (vector)", "0Q10111001111001100110nnnnnddddd")
+INST(FRINTX_2, "FRINTX (vector)", "0Q1011100z100001100110nnnnnddddd")
+//INST(FCVTNU_3, "FCVTNU (vector)", "0Q10111001111001101010nnnnnddddd")
+INST(FCVTNU_4, "FCVTNU (vector)", "0Q1011100z100001101010nnnnnddddd")
+//INST(FCVTMU_3, "FCVTMU (vector)", "0Q10111001111001101110nnnnnddddd")
+INST(FCVTMU_4, "FCVTMU (vector)", "0Q1011100z100001101110nnnnnddddd")
+//INST(FCVTAU_3, "FCVTAU (vector)", "0Q10111001111001110010nnnnnddddd")
+INST(FCVTAU_4, "FCVTAU (vector)", "0Q1011100z100001110010nnnnnddddd")
+//INST(UCVTF_int_3, "UCVTF (vector, integer)", "0Q10111001111001110110nnnnnddddd")
+INST(UCVTF_int_4, "UCVTF (vector, integer)", "0Q1011100z100001110110nnnnnddddd")
+INST(NOT, "NOT", "0Q10111000100000010110nnnnnddddd")
+INST(RBIT_asimd, "RBIT (vector)", "0Q10111001100000010110nnnnnddddd")
+INST(FNEG_1, "FNEG (vector)", "0Q10111011111000111110nnnnnddddd")
+INST(FNEG_2, "FNEG (vector)", "0Q1011101z100000111110nnnnnddddd")
+INST(FRINTI_1, "FRINTI (vector)", "0Q10111011111001100110nnnnnddddd")
+INST(FRINTI_2, "FRINTI (vector)", "0Q1011101z100001100110nnnnnddddd")
+//INST(FCMGE_zero_3, "FCMGE (zero)", "0Q10111011111000110010nnnnnddddd")
+INST(FCMGE_zero_4, "FCMGE (zero)", "0Q1011101z100000110010nnnnnddddd")
+//INST(FCMLE_3, "FCMLE (zero)", "0Q10111011111000110110nnnnnddddd")
+INST(FCMLE_4, "FCMLE (zero)", "0Q1011101z100000110110nnnnnddddd")
+//INST(FCVTPU_3, "FCVTPU (vector)", "0Q10111011111001101010nnnnnddddd")
+INST(FCVTPU_4, "FCVTPU (vector)", "0Q1011101z100001101010nnnnnddddd")
+//INST(FCVTZU_int_3, "FCVTZU (vector, integer)", "0Q10111011111001101110nnnnnddddd")
+INST(FCVTZU_int_4, "FCVTZU (vector, integer)", "0Q1011101z100001101110nnnnnddddd")
+INST(URSQRTE, "URSQRTE", "0Q1011101z100001110010nnnnnddddd")
+INST(FRSQRTE_3, "FRSQRTE", "0Q10111011111001110110nnnnnddddd")
+INST(FRSQRTE_4, "FRSQRTE", "0Q1011101z100001110110nnnnnddddd")
+//INST(FSQRT_1, "FSQRT (vector)", "0Q10111011111001111110nnnnnddddd")
+INST(FSQRT_2, "FSQRT (vector)", "0Q1011101z100001111110nnnnnddddd")
+//INST(FRINT32X_1, "FRINT32X (vector)", "0Q1011100z100001111110nnnnnddddd") // ARMv8.5
+//INST(FRINT64X_1, "FRINT64X (vector)", "0Q1011100z100001111010nnnnnddddd") // ARMv8.5
+//INST(FRINT32Z_1, "FRINT32Z (vector)", "0Q0011100z100001111010nnnnnddddd") // ARMv8.5
+//INST(FRINT64Z_1, "FRINT64Z (vector)", "0Q0011100z100001111110nnnnnddddd") // ARMv8.5
+
+// Data Processing - FP and SIMD - SIMD across lanes
+INST(SADDLV, "SADDLV", "0Q001110zz110000001110nnnnnddddd")
+INST(SMAXV, "SMAXV", "0Q001110zz110000101010nnnnnddddd")
+INST(SMINV, "SMINV", "0Q001110zz110001101010nnnnnddddd")
+INST(ADDV, "ADDV", "0Q001110zz110001101110nnnnnddddd")
+//INST(FMAXNMV_1, "FMAXNMV", "0Q00111000110000110010nnnnnddddd")
+INST(FMAXNMV_2, "FMAXNMV", "0Q1011100z110000110010nnnnnddddd")
+//INST(FMAXV_1, "FMAXV", "0Q00111000110000111110nnnnnddddd")
+INST(FMAXV_2, "FMAXV", "0Q1011100z110000111110nnnnnddddd")
+//INST(FMINNMV_1, "FMINNMV", "0Q00111010110000110010nnnnnddddd")
+INST(FMINNMV_2, "FMINNMV", "0Q1011101z110000110010nnnnnddddd")
+//INST(FMINV_1, "FMINV", "0Q00111010110000111110nnnnnddddd")
+INST(FMINV_2, "FMINV", "0Q1011101z110000111110nnnnnddddd")
+INST(UADDLV, "UADDLV", "0Q101110zz110000001110nnnnnddddd")
+INST(UMAXV, "UMAXV", "0Q101110zz110000101010nnnnnddddd")
+INST(UMINV, "UMINV", "0Q101110zz110001101010nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD three different
+INST(SADDL, "SADDL, SADDL2", "0Q001110zz1mmmmm000000nnnnnddddd")
+INST(SADDW, "SADDW, SADDW2", "0Q001110zz1mmmmm000100nnnnnddddd")
+INST(SSUBL, "SSUBL, SSUBL2", "0Q001110zz1mmmmm001000nnnnnddddd")
+INST(SSUBW, "SSUBW, SSUBW2", "0Q001110zz1mmmmm001100nnnnnddddd")
+INST(ADDHN, "ADDHN, ADDHN2", "0Q001110zz1mmmmm010000nnnnnddddd")
+INST(SABAL, "SABAL, SABAL2", "0Q001110zz1mmmmm010100nnnnnddddd")
+INST(SUBHN, "SUBHN, SUBHN2", "0Q001110zz1mmmmm011000nnnnnddddd")
+INST(SABDL, "SABDL, SABDL2", "0Q001110zz1mmmmm011100nnnnnddddd")
+INST(SMLAL_vec, "SMLAL, SMLAL2 (vector)", "0Q001110zz1mmmmm100000nnnnnddddd")
+INST(SMLSL_vec, "SMLSL, SMLSL2 (vector)", "0Q001110zz1mmmmm101000nnnnnddddd")
+INST(SMULL_vec, "SMULL, SMULL2 (vector)", "0Q001110zz1mmmmm110000nnnnnddddd")
+INST(PMULL, "PMULL, PMULL2", "0Q001110zz1mmmmm111000nnnnnddddd")
+INST(UADDL, "UADDL, UADDL2", "0Q101110zz1mmmmm000000nnnnnddddd")
+INST(UADDW, "UADDW, UADDW2", "0Q101110zz1mmmmm000100nnnnnddddd")
+INST(USUBL, "USUBL, USUBL2", "0Q101110zz1mmmmm001000nnnnnddddd")
+INST(USUBW, "USUBW, USUBW2", "0Q101110zz1mmmmm001100nnnnnddddd")
+INST(RADDHN, "RADDHN, RADDHN2", "0Q101110zz1mmmmm010000nnnnnddddd")
+INST(UABAL, "UABAL, UABAL2", "0Q101110zz1mmmmm010100nnnnnddddd")
+INST(RSUBHN, "RSUBHN, RSUBHN2", "0Q101110zz1mmmmm011000nnnnnddddd")
+INST(UABDL, "UABDL, UABDL2", "0Q101110zz1mmmmm011100nnnnnddddd")
+INST(UMLAL_vec, "UMLAL, UMLAL2 (vector)", "0Q101110zz1mmmmm100000nnnnnddddd")
+INST(UMLSL_vec, "UMLSL, UMLSL2 (vector)", "0Q101110zz1mmmmm101000nnnnnddddd")
+INST(UMULL_vec, "UMULL, UMULL2 (vector)", "0Q101110zz1mmmmm110000nnnnnddddd")
+//INST(SQDMLAL_vec_2, "SQDMLAL, SQDMLAL2 (vector)", "0Q001110zz1mmmmm100100nnnnnddddd")
+//INST(SQDMLSL_vec_2, "SQDMLSL, SQDMLSL2 (vector)", "0Q001110zz1mmmmm101100nnnnnddddd")
+INST(SQDMULL_vec_2, "SQDMULL, SQDMULL2 (vector)", "0Q001110zz1mmmmm110100nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD three same
+INST(SHADD, "SHADD", "0Q001110zz1mmmmm000001nnnnnddddd")
+INST(SQADD_2, "SQADD", "0Q001110zz1mmmmm000011nnnnnddddd")
+INST(SRHADD, "SRHADD", "0Q001110zz1mmmmm000101nnnnnddddd")
+INST(SHSUB, "SHSUB", "0Q001110zz1mmmmm001001nnnnnddddd")
+INST(SQSUB_2, "SQSUB", "0Q001110zz1mmmmm001011nnnnnddddd")
+INST(CMGT_reg_2, "CMGT (register)", "0Q001110zz1mmmmm001101nnnnnddddd")
+INST(CMGE_reg_2, "CMGE (register)", "0Q001110zz1mmmmm001111nnnnnddddd")
+INST(SSHL_2, "SSHL", "0Q001110zz1mmmmm010001nnnnnddddd")
+INST(SQSHL_reg_2, "SQSHL (register)", "0Q001110zz1mmmmm010011nnnnnddddd")
+INST(SRSHL_2, "SRSHL", "0Q001110zz1mmmmm010101nnnnnddddd")
+//INST(SQRSHL_2, "SQRSHL", "0Q001110zz1mmmmm010111nnnnnddddd")
+INST(SMAX, "SMAX", "0Q001110zz1mmmmm011001nnnnnddddd")
+INST(SMIN, "SMIN", "0Q001110zz1mmmmm011011nnnnnddddd")
+INST(SABD, "SABD", "0Q001110zz1mmmmm011101nnnnnddddd")
+INST(SABA, "SABA", "0Q001110zz1mmmmm011111nnnnnddddd")
+INST(ADD_vector, "ADD (vector)", "0Q001110zz1mmmmm100001nnnnnddddd")
+INST(CMTST_2, "CMTST", "0Q001110zz1mmmmm100011nnnnnddddd")
+INST(MLA_vec, "MLA (vector)", "0Q001110zz1mmmmm100101nnnnnddddd")
+INST(MUL_vec, "MUL (vector)", "0Q001110zz1mmmmm100111nnnnnddddd")
+INST(SMAXP, "SMAXP", "0Q001110zz1mmmmm101001nnnnnddddd")
+INST(SMINP, "SMINP", "0Q001110zz1mmmmm101011nnnnnddddd")
+INST(SQDMULH_vec_2, "SQDMULH (vector)", "0Q001110zz1mmmmm101101nnnnnddddd")
+INST(ADDP_vec, "ADDP (vector)", "0Q001110zz1mmmmm101111nnnnnddddd")
+INST(FMAXNM_2, "FMAXNM (vector)", "0Q0011100z1mmmmm110001nnnnnddddd")
+INST(FMLA_vec_2, "FMLA (vector)", "0Q0011100z1mmmmm110011nnnnnddddd")
+INST(FADD_2, "FADD (vector)", "0Q0011100z1mmmmm110101nnnnnddddd")
+INST(FMAX_2, "FMAX (vector)", "0Q0011100z1mmmmm111101nnnnnddddd")
+INST(FMULX_vec_4, "FMULX", "0Q0011100z1mmmmm110111nnnnnddddd")
+INST(FCMEQ_reg_4, "FCMEQ (register)", "0Q0011100z1mmmmm111001nnnnnddddd")
+//INST(FMLAL_vec_1, "FMLAL, FMLAL2 (vector)", "0Q0011100z1mmmmm111011nnnnnddddd")
+INST(FRECPS_4, "FRECPS", "0Q0011100z1mmmmm111111nnnnnddddd")
+INST(AND_asimd, "AND (vector)", "0Q001110001mmmmm000111nnnnnddddd")
+INST(BIC_asimd_reg, "BIC (vector, register)", "0Q001110011mmmmm000111nnnnnddddd")
+INST(FMINNM_2, "FMINNM (vector)", "0Q0011101z1mmmmm110001nnnnnddddd")
+INST(FMLS_vec_2, "FMLS (vector)", "0Q0011101z1mmmmm110011nnnnnddddd")
+INST(FSUB_2, "FSUB (vector)", "0Q0011101z1mmmmm110101nnnnnddddd")
+//INST(FMLSL_vec_1, "FMLSL, FMLSL2 (vector)", "0Q0011101z1mmmmm111011nnnnnddddd")
+INST(FMIN_2, "FMIN (vector)", "0Q0011101z1mmmmm111101nnnnnddddd")
+INST(FRSQRTS_4, "FRSQRTS", "0Q0011101z1mmmmm111111nnnnnddddd")
+INST(ORR_asimd_reg, "ORR (vector, register)", "0Q001110101mmmmm000111nnnnnddddd")
+INST(ORN_asimd, "ORN (vector)", "0Q001110111mmmmm000111nnnnnddddd")
+INST(UHADD, "UHADD", "0Q101110zz1mmmmm000001nnnnnddddd")
+INST(UQADD_2, "UQADD", "0Q101110zz1mmmmm000011nnnnnddddd")
+INST(URHADD, "URHADD", "0Q101110zz1mmmmm000101nnnnnddddd")
+INST(UHSUB, "UHSUB", "0Q101110zz1mmmmm001001nnnnnddddd")
+INST(UQSUB_2, "UQSUB", "0Q101110zz1mmmmm001011nnnnnddddd")
+INST(CMHI_2, "CMHI (register)", "0Q101110zz1mmmmm001101nnnnnddddd")
+INST(CMHS_2, "CMHS (register)", "0Q101110zz1mmmmm001111nnnnnddddd")
+INST(USHL_2, "USHL", "0Q101110zz1mmmmm010001nnnnnddddd")
+INST(UQSHL_reg_2, "UQSHL (register)", "0Q101110zz1mmmmm010011nnnnnddddd")
+INST(URSHL_2, "URSHL", "0Q101110zz1mmmmm010101nnnnnddddd")
+//INST(UQRSHL_2, "UQRSHL", "0Q101110zz1mmmmm010111nnnnnddddd")
+INST(UMAX, "UMAX", "0Q101110zz1mmmmm011001nnnnnddddd")
+INST(UMIN, "UMIN", "0Q101110zz1mmmmm011011nnnnnddddd")
+INST(UABD, "UABD", "0Q101110zz1mmmmm011101nnnnnddddd")
+INST(UABA, "UABA", "0Q101110zz1mmmmm011111nnnnnddddd")
+INST(SUB_2, "SUB (vector)", "0Q101110zz1mmmmm100001nnnnnddddd")
+INST(CMEQ_reg_2, "CMEQ (register)", "0Q101110zz1mmmmm100011nnnnnddddd")
+INST(MLS_vec, "MLS (vector)", "0Q101110zz1mmmmm100101nnnnnddddd")
+INST(PMUL, "PMUL", "0Q101110zz1mmmmm100111nnnnnddddd")
+INST(UMAXP, "UMAXP", "0Q101110zz1mmmmm101001nnnnnddddd")
+INST(UMINP, "UMINP", "0Q101110zz1mmmmm101011nnnnnddddd")
+INST(SQRDMULH_vec_2, "SQRDMULH (vector)", "0Q101110zz1mmmmm101101nnnnnddddd")
+INST(FMAXNMP_vec_2, "FMAXNMP (vector)", "0Q1011100z1mmmmm110001nnnnnddddd")
+//INST(FMLAL_vec_2, "FMLAL, FMLAL2 (vector)", "0Q1011100z1mmmmm110011nnnnnddddd")
+INST(FADDP_vec_2, "FADDP (vector)", "0Q1011100z1mmmmm110101nnnnnddddd")
+INST(FMUL_vec_2, "FMUL (vector)", "0Q1011100z1mmmmm110111nnnnnddddd")
+INST(FCMGE_reg_4, "FCMGE (register)", "0Q1011100z1mmmmm111001nnnnnddddd")
+INST(FACGE_4, "FACGE", "0Q1011100z1mmmmm111011nnnnnddddd")
+INST(FMAXP_vec_2, "FMAXP (vector)", "0Q1011100z1mmmmm111101nnnnnddddd")
+INST(FDIV_2, "FDIV (vector)", "0Q1011100z1mmmmm111111nnnnnddddd")
+INST(EOR_asimd, "EOR (vector)", "0Q101110001mmmmm000111nnnnnddddd")
+INST(BSL, "BSL", "0Q101110011mmmmm000111nnnnnddddd")
+INST(FMINNMP_vec_2, "FMINNMP (vector)", "0Q1011101z1mmmmm110001nnnnnddddd")
+//INST(FMLSL_vec_2, "FMLSL, FMLSL2 (vector)", "0Q1011101z1mmmmm110011nnnnnddddd")
+INST(FABD_4, "FABD", "0Q1011101z1mmmmm110101nnnnnddddd")
+INST(FCMGT_reg_4, "FCMGT (register)", "0Q1011101z1mmmmm111001nnnnnddddd")
+INST(FACGT_4, "FACGT", "0Q1011101z1mmmmm111011nnnnnddddd")
+INST(FMINP_vec_2, "FMINP (vector)", "0Q1011101z1mmmmm111101nnnnnddddd")
+INST(BIT, "BIT", "0Q101110101mmmmm000111nnnnnddddd")
+INST(BIF, "BIF", "0Q101110111mmmmm000111nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD modified immediate
+INST(MOVI, "MOVI, MVNI, ORR, BIC (vector, immediate)", "0Qo0111100000abcmmmm01defghddddd")
+INST(FMOV_2, "FMOV (vector, immediate)", "0Qo0111100000abc111101defghddddd")
+INST(FMOV_3, "FMOV (vector, immediate)", "0Q00111100000abc111111defghddddd")
+INST(UnallocatedEncoding, "Unallocated SIMD modified immediate", "0--0111100000-------11----------")
+
+// Data Processing - FP and SIMD - SIMD Shift by immediate
+INST(SSHR_2, "SSHR", "0Q0011110IIIIiii000001nnnnnddddd")
+INST(SSRA_2, "SSRA", "0Q0011110IIIIiii000101nnnnnddddd")
+INST(SRSHR_2, "SRSHR", "0Q0011110IIIIiii001001nnnnnddddd")
+INST(SRSRA_2, "SRSRA", "0Q0011110IIIIiii001101nnnnnddddd")
+INST(SHL_2, "SHL", "0Q0011110IIIIiii010101nnnnnddddd")
+INST(SQSHL_imm_2, "SQSHL (immediate)", "0Q0011110IIIIiii011101nnnnnddddd")
+INST(SHRN, "SHRN, SHRN2", "0Q0011110IIIIiii100001nnnnnddddd")
+INST(RSHRN, "RSHRN, RSHRN2", "0Q0011110IIIIiii100011nnnnnddddd")
+INST(SQSHRN_2, "SQSHRN, SQSHRN2", "0Q0011110IIIIiii100101nnnnnddddd")
+INST(SQRSHRN_2, "SQRSHRN, SQRSHRN2", "0Q0011110IIIIiii100111nnnnnddddd")
+INST(SSHLL, "SSHLL, SSHLL2", "0Q0011110IIIIiii101001nnnnnddddd")
+INST(SCVTF_fix_2, "SCVTF (vector, fixed-point)", "0Q0011110IIIIiii111001nnnnnddddd")
+INST(FCVTZS_fix_2, "FCVTZS (vector, fixed-point)", "0Q0011110IIIIiii111111nnnnnddddd")
+INST(USHR_2, "USHR", "0Q1011110IIIIiii000001nnnnnddddd")
+INST(USRA_2, "USRA", "0Q1011110IIIIiii000101nnnnnddddd")
+INST(URSHR_2, "URSHR", "0Q1011110IIIIiii001001nnnnnddddd")
+INST(URSRA_2, "URSRA", "0Q1011110IIIIiii001101nnnnnddddd")
+INST(SRI_2, "SRI", "0Q1011110IIIIiii010001nnnnnddddd")
+INST(SLI_2, "SLI", "0Q1011110IIIIiii010101nnnnnddddd")
+INST(SQSHLU_2, "SQSHLU", "0Q1011110IIIIiii011001nnnnnddddd")
+INST(UQSHL_imm_2, "UQSHL (immediate)", "0Q1011110IIIIiii011101nnnnnddddd")
+INST(SQSHRUN_2, "SQSHRUN, SQSHRUN2", "0Q1011110IIIIiii100001nnnnnddddd")
+INST(SQRSHRUN_2, "SQRSHRUN, SQRSHRUN2", "0Q1011110IIIIiii100011nnnnnddddd")
+INST(UQSHRN_2, "UQSHRN, UQSHRN2", "0Q1011110IIIIiii100101nnnnnddddd")
+INST(UQRSHRN_2, "UQRSHRN, UQRSHRN2", "0Q1011110IIIIiii100111nnnnnddddd")
+INST(USHLL, "USHLL, USHLL2", "0Q1011110IIIIiii101001nnnnnddddd")
+INST(UCVTF_fix_2, "UCVTF (vector, fixed-point)", "0Q1011110IIIIiii111001nnnnnddddd")
+INST(FCVTZU_fix_2, "FCVTZU (vector, fixed-point)", "0Q1011110IIIIiii111111nnnnnddddd")
+
+// Data Processing - FP and SIMD - SIMD vector x indexed element
+INST(SMLAL_elt, "SMLAL, SMLAL2 (by element)", "0Q001111zzLMmmmm0010H0nnnnnddddd")
+//INST(SQDMLAL_elt_2, "SQDMLAL, SQDMLAL2 (by element)", "0Q001111zzLMmmmm0011H0nnnnnddddd")
+INST(SMLSL_elt, "SMLSL, SMLSL2 (by element)", "0Q001111zzLMmmmm0110H0nnnnnddddd")
+//INST(SQDMLSL_elt_2, "SQDMLSL, SQDMLSL2 (by element)", "0Q001111zzLMmmmm0111H0nnnnnddddd")
+INST(MUL_elt, "MUL (by element)", "0Q001111zzLMmmmm1000H0nnnnnddddd")
+INST(SMULL_elt, "SMULL, SMULL2 (by element)", "0Q001111zzLMmmmm1010H0nnnnnddddd")
+INST(SQDMULL_elt_2, "SQDMULL, SQDMULL2 (by element)", "0Q001111zzLMmmmm1011H0nnnnnddddd")
+INST(SQDMULH_elt_2, "SQDMULH (by element)", "0Q001111zzLMmmmm1100H0nnnnnddddd")
+INST(SQRDMULH_elt_2, "SQRDMULH (by element)", "0Q001111zzLMmmmm1101H0nnnnnddddd")
+INST(SDOT_elt, "SDOT (by element)", "0Q001111zzLMmmmm1110H0nnnnnddddd")
+INST(FMLA_elt_3, "FMLA (by element)", "0Q00111100LMmmmm0001H0nnnnnddddd")
+INST(FMLA_elt_4, "FMLA (by element)", "0Q0011111zLMmmmm0001H0nnnnnddddd")
+INST(FMLS_elt_3, "FMLS (by element)", "0Q00111100LMmmmm0101H0nnnnnddddd")
+INST(FMLS_elt_4, "FMLS (by element)", "0Q0011111zLMmmmm0101H0nnnnnddddd")
+//INST(FMUL_elt_3, "FMUL (by element)", "0Q00111100LMmmmm1001H0nnnnnddddd")
+INST(FMUL_elt_4, "FMUL (by element)", "0Q0011111zLMmmmm1001H0nnnnnddddd")
+//INST(FMLAL_elt_1, "FMLAL, FMLAL2 (by element)", "0Q0011111zLMmmmm0000H0nnnnnddddd")
+//INST(FMLAL_elt_2, "FMLAL, FMLAL2 (by element)", "0Q1011111zLMmmmm1000H0nnnnnddddd")
+//INST(FMLSL_elt_1, "FMLSL, FMLSL2 (by element)", "0Q0011111zLMmmmm0100H0nnnnnddddd")
+//INST(FMLSL_elt_2, "FMLSL, FMLSL2 (by element)", "0Q1011111zLMmmmm1100H0nnnnnddddd")
+INST(MLA_elt, "MLA (by element)", "0Q101111zzLMmmmm0000H0nnnnnddddd")
+INST(UMLAL_elt, "UMLAL, UMLAL2 (by element)", "0Q101111zzLMmmmm0010H0nnnnnddddd")
+INST(MLS_elt, "MLS (by element)", "0Q101111zzLMmmmm0100H0nnnnnddddd")
+INST(UMLSL_elt, "UMLSL, UMLSL2 (by element)", "0Q101111zzLMmmmm0110H0nnnnnddddd")
+INST(UMULL_elt, "UMULL, UMULL2 (by element)", "0Q101111zzLMmmmm1010H0nnnnnddddd")
+//INST(SQRDMLAH_elt_2, "SQRDMLAH (by element)", "0Q101111zzLMmmmm1101H0nnnnnddddd")
+INST(UDOT_elt, "UDOT (by element)", "0Q101111zzLMmmmm1110H0nnnnnddddd")
+//INST(SQRDMLSH_elt_2, "SQRDMLSH (by element)", "0Q101111zzLMmmmm1111H0nnnnnddddd")
+//INST(FMULX_elt_3, "FMULX (by element)", "0Q10111100LMmmmm1001H0nnnnnddddd")
+INST(FMULX_elt_4, "FMULX (by element)", "0Q1011111zLMmmmm1001H0nnnnnddddd")
+INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
+
+// Data Processing - FP and SIMD - Cryptographic three register
+INST(SM3TT1A, "SM3TT1A", "11001110010mmmmm10ii00nnnnnddddd")
+INST(SM3TT1B, "SM3TT1B", "11001110010mmmmm10ii01nnnnnddddd")
+INST(SM3TT2A, "SM3TT2A", "11001110010mmmmm10ii10nnnnnddddd")
+INST(SM3TT2B, "SM3TT2B", "11001110010mmmmm10ii11nnnnnddddd")
+
+// Data Processing - FP and SIMD - SHA512 three register
+INST(SHA512H, "SHA512H", "11001110011mmmmm100000nnnnnddddd")
+INST(SHA512H2, "SHA512H2", "11001110011mmmmm100001nnnnnddddd")
+INST(SHA512SU1, "SHA512SU1", "11001110011mmmmm100010nnnnnddddd")
+INST(RAX1, "RAX1", "11001110011mmmmm100011nnnnnddddd")
+INST(SM3PARTW1, "SM3PARTW1", "11001110011mmmmm110000nnnnnddddd")
+INST(SM3PARTW2, "SM3PARTW2", "11001110011mmmmm110001nnnnnddddd")
+INST(SM4EKEY, "SM4EKEY", "11001110011mmmmm110010nnnnnddddd")
+INST(XAR, "XAR", "11001110100mmmmmiiiiiinnnnnddddd")
+
+// Data Processing - FP and SIMD - Cryptographic four register
+INST(EOR3, "EOR3", "11001110000mmmmm0aaaaannnnnddddd")
+INST(BCAX, "BCAX", "11001110001mmmmm0aaaaannnnnddddd")
+INST(SM3SS1, "SM3SS1", "11001110010mmmmm0aaaaannnnnddddd")
+
+// Data Processing - FP and SIMD - SHA512 two register
+INST(SHA512SU0, "SHA512SU0", "1100111011000000100000nnnnnddddd")
+INST(SM4E, "SM4E", "1100111011000000100001nnnnnddddd")
+
+// Data Processing - FP and SIMD - Conversion between floating point and fixed point
+INST(SCVTF_float_fix, "SCVTF (scalar, fixed-point)", "z0011110yy000010ppppppnnnnnddddd")
+INST(UCVTF_float_fix, "UCVTF (scalar, fixed-point)", "z0011110yy000011ppppppnnnnnddddd")
+INST(FCVTZS_float_fix, "FCVTZS (scalar, fixed-point)", "z0011110yy011000ppppppnnnnnddddd")
+INST(FCVTZU_float_fix, "FCVTZU (scalar, fixed-point)", "z0011110yy011001ppppppnnnnnddddd")
+
+// Data Processing - FP and SIMD - Conversion between floating point and integer
+INST(FCVTNS_float, "FCVTNS (scalar)", "z0011110yy100000000000nnnnnddddd")
+INST(FCVTNU_float, "FCVTNU (scalar)", "z0011110yy100001000000nnnnnddddd")
+INST(SCVTF_float_int, "SCVTF (scalar, integer)", "z0011110yy100010000000nnnnnddddd")
+INST(UCVTF_float_int, "UCVTF (scalar, integer)", "z0011110yy100011000000nnnnnddddd")
+INST(FCVTAS_float, "FCVTAS (scalar)", "z0011110yy100100000000nnnnnddddd")
+INST(FCVTAU_float, "FCVTAU (scalar)", "z0011110yy100101000000nnnnnddddd")
+INST(FMOV_float_gen, "FMOV (general)", "z0011110yy10r11o000000nnnnnddddd")
+INST(FCVTPS_float, "FCVTPS (scalar)", "z0011110yy101000000000nnnnnddddd")
+INST(FCVTPU_float, "FCVTPU (scalar)", "z0011110yy101001000000nnnnnddddd")
+INST(FCVTMS_float, "FCVTMS (scalar)", "z0011110yy110000000000nnnnnddddd")
+INST(FCVTMU_float, "FCVTMU (scalar)", "z0011110yy110001000000nnnnnddddd")
+INST(FCVTZS_float_int, "FCVTZS (scalar, integer)", "z0011110yy111000000000nnnnnddddd")
+INST(FCVTZU_float_int, "FCVTZU (scalar, integer)", "z0011110yy111001000000nnnnnddddd")
+//INST(FJCVTZS, "FJCVTZS", "0001111001111110000000nnnnnddddd")
+
+// Data Processing - FP and SIMD - Floating point data processing
+INST(FMOV_float, "FMOV (register)", "00011110yy100000010000nnnnnddddd")
+INST(FABS_float, "FABS (scalar)", "00011110yy100000110000nnnnnddddd")
+INST(FNEG_float, "FNEG (scalar)", "00011110yy100001010000nnnnnddddd")
+INST(FSQRT_float, "FSQRT (scalar)", "00011110yy100001110000nnnnnddddd")
+INST(FCVT_float, "FCVT", "00011110yy10001oo10000nnnnnddddd")
+INST(FRINTN_float, "FRINTN (scalar)", "00011110yy100100010000nnnnnddddd")
+INST(FRINTP_float, "FRINTP (scalar)", "00011110yy100100110000nnnnnddddd")
+INST(FRINTM_float, "FRINTM (scalar)", "00011110yy100101010000nnnnnddddd")
+INST(FRINTZ_float, "FRINTZ (scalar)", "00011110yy100101110000nnnnnddddd")
+INST(FRINTA_float, "FRINTA (scalar)", "00011110yy100110010000nnnnnddddd")
+INST(FRINTX_float, "FRINTX (scalar)", "00011110yy100111010000nnnnnddddd")
+INST(FRINTI_float, "FRINTI (scalar)", "00011110yy100111110000nnnnnddddd")
+//INST(FRINT32X_float, "FRINT32X (scalar)", "00011110yy101000110000nnnnnddddd") // ARMv8.5
+//INST(FRINT64X_float, "FRINT64X (scalar)", "00011110yy101001110000nnnnnddddd") // ARMv8.5
+//INST(FRINT32Z_float, "FRINT32Z (scalar)", "00011110yy101000010000nnnnnddddd") // ARMv8.5
+//INST(FRINT64Z_float, "FRINT64Z (scalar)", "00011110yy101001010000nnnnnddddd") // ARMv8.5
+
+// Data Processing - FP and SIMD - Floating point compare
+INST(FCMP_float, "FCMP", "00011110yy1mmmmm001000nnnnn0o000")
+INST(FCMPE_float, "FCMPE", "00011110yy1mmmmm001000nnnnn1o000")
+
+// Data Processing - FP and SIMD - Floating point immediate
+INST(FMOV_float_imm, "FMOV (scalar, immediate)", "00011110yy1iiiiiiii10000000ddddd")
+
+// Data Processing - FP and SIMD - Floating point conditional compare
+INST(FCCMP_float, "FCCMP", "00011110yy1mmmmmcccc01nnnnn0ffff")
+INST(FCCMPE_float, "FCCMPE", "00011110yy1mmmmmcccc01nnnnn1ffff")
+
+// Data Processing - FP and SIMD - Floating point data processing two register
+INST(FMUL_float, "FMUL (scalar)", "00011110yy1mmmmm000010nnnnnddddd")
+INST(FDIV_float, "FDIV (scalar)", "00011110yy1mmmmm000110nnnnnddddd")
+INST(FADD_float, "FADD (scalar)", "00011110yy1mmmmm001010nnnnnddddd")
+INST(FSUB_float, "FSUB (scalar)", "00011110yy1mmmmm001110nnnnnddddd")
+INST(FMAX_float, "FMAX (scalar)", "00011110yy1mmmmm010010nnnnnddddd")
+INST(FMIN_float, "FMIN (scalar)", "00011110yy1mmmmm010110nnnnnddddd")
+INST(FMAXNM_float, "FMAXNM (scalar)", "00011110yy1mmmmm011010nnnnnddddd")
+INST(FMINNM_float, "FMINNM (scalar)", "00011110yy1mmmmm011110nnnnnddddd")
+INST(FNMUL_float, "FNMUL (scalar)", "00011110yy1mmmmm100010nnnnnddddd")
+
+// Data Processing - FP and SIMD - Floating point conditional select
+INST(FCSEL_float, "FCSEL", "00011110yy1mmmmmcccc11nnnnnddddd")
+
+// Data Processing - FP and SIMD - Floating point data processing three register
+INST(FMADD_float, "FMADD", "00011111yy0mmmmm0aaaaannnnnddddd")
+INST(FMSUB_float, "FMSUB", "00011111yy0mmmmm1aaaaannnnnddddd")
+INST(FNMADD_float, "FNMADD", "00011111yy1mmmmm0aaaaannnnnddddd")
+INST(FNMSUB_float, "FNMSUB", "00011111yy1mmmmm1aaaaannnnnddddd")
+
+// BFloat16
+//INST(BFCVT, "BFCVT", "0001111001100011010000nnnnnddddd") // v8.6
+//INST(BFCVTN, "BFCVTN{2}", "0Q00111010100001011010nnnnnddddd") // v8.6
+//INST(BFDOT_element, "BFDOT (by element)", "0Q00111101LMmmmm1111H0nnnnnddddd") // v8.6
+//INST(BFDOT_vec, "BFDOT (vector)", "0Q101110010mmmmm111111nnnnnddddd") // v8.6
+//INST(BFMLALX_element, "BFMLALX (by element)", "0Q00111111LMmmmm1111H0nnnnnddddd") // v8.6
+//INST(BFMLALX_vector, "BFMLALX (vector)", "0Q101110110mmmmm111111nnnnnddddd") // v8.6
+//INST(BFMMLA, "BFMMLA", "01101110010mmmmm111011nnnnnddddd") // v8.6
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp
new file mode 100644
index 0000000000..05996aeb64
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp
@@ -0,0 +1,69 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/a64_translate.h"
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/decoder/a64.h"
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/terminal.h"
+
+namespace Dynarmic::A64 {
+
+IR::Block Translate(LocationDescriptor descriptor, MemoryReadCodeFuncType memory_read_code, TranslationOptions options) {
+ const bool single_step = descriptor.SingleStepping();
+
+ IR::Block block{descriptor};
+ TranslatorVisitor visitor{block, descriptor, std::move(options)};
+
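+    // Decode and translate one instruction per iteration until a handler sets a
+    // terminal and returns false, or until single-stepping stops us after one
+    // instruction; an unreadable code fetch raises a NoExecuteFault exception.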
+ bool should_continue = true;
+ do {
+ const u64 pc = visitor.ir.current_location->PC();
+
+ if (const auto instruction = memory_read_code(pc)) {
+ if (auto decoder = Decode<TranslatorVisitor>(*instruction)) {
+ should_continue = decoder->get().call(visitor, *instruction);
+ } else {
+ should_continue = visitor.InterpretThisInstruction();
+ }
+ } else {
+ should_continue = visitor.RaiseException(Exception::NoExecuteFault);
+ }
+
+ visitor.ir.current_location = visitor.ir.current_location->AdvancePC(4);
+ block.CycleCount()++;
+ } while (should_continue && !single_step);
+
+ if (single_step && should_continue) {
+ visitor.ir.SetTerm(IR::Term::LinkBlock{*visitor.ir.current_location});
+ }
+
+ ASSERT_MSG(block.HasTerminal(), "Terminal has not been set");
+
+ block.SetEndLocation(*visitor.ir.current_location);
+
+ return block;
+}
+
+bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction) {
+ TranslatorVisitor visitor{block, descriptor, {}};
+
+ bool should_continue = true;
+ if (auto decoder = Decode<TranslatorVisitor>(instruction)) {
+ should_continue = decoder->get().call(visitor, instruction);
+ } else {
+ should_continue = visitor.InterpretThisInstruction();
+ }
+
+ visitor.ir.current_location = visitor.ir.current_location->AdvancePC(4);
+ block.CycleCount()++;
+
+ block.SetEndLocation(*visitor.ir.current_location);
+
+ return should_continue;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h
new file mode 100644
index 0000000000..2f62ac6e50
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h
@@ -0,0 +1,59 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+#pragma once
+
+#include <functional>
+#include <optional>
+
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic {
+
+namespace IR {
+class Block;
+} // namespace IR
+
+namespace A64 {
+
+class LocationDescriptor;
+
+using MemoryReadCodeFuncType = std::function<std::optional<u32>(u64 vaddr)>;
+
+struct TranslationOptions {
+ /// This changes what IR we emit when we translate an unpredictable instruction.
+ /// If this is false, the ExceptionRaised IR instruction is emitted.
+    /// If this is true, we instead define behaviour for some of these instructions.
+ bool define_unpredictable_behaviour = false;
+
+    /// This tells the translator that a wall clock will be used, allowing it to
+    /// avoid emitting code that is only needed for cycle timers.
+ bool wall_clock_cntpct = false;
+
+ /// This changes what IR we emit when we translate a hint instruction.
+ /// If this is false, we treat the instruction as a NOP.
+ /// If this is true, we emit an ExceptionRaised instruction.
+ bool hook_hint_instructions = true;
+};
+
+/**
+ * This function translates instructions in memory into our intermediate representation.
+ * @param descriptor The starting location of the basic block. Includes information like PC, FPCR state, &c.
+ * @param memory_read_code The function we should use to read emulated memory.
+ * @param options Configures how certain instructions are translated.
+ * @return A translated basic block in the intermediate representation.
+ */
+IR::Block Translate(LocationDescriptor descriptor, MemoryReadCodeFuncType memory_read_code, TranslationOptions options);
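+/* Example usage (editor's illustration only; `descriptor` and `fetch_instruction`
+ * are hypothetical stand-ins for values provided by the embedding JIT):
+ *
+ *     TranslationOptions options{};
+ *     options.wall_clock_cntpct = true;
+ *
+ *     MemoryReadCodeFuncType read_code = [](u64 vaddr) -> std::optional<u32> {
+ *         return fetch_instruction(vaddr);  // return std::nullopt on a fetch fault
+ *     };
+ *
+ *     LocationDescriptor descriptor = ...;  // starting PC/FPCR state
+ *     IR::Block block = Translate(descriptor, read_code, options);
+ */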
+
+/**
+ * This function translates a single provided instruction into our intermediate representation.
+ * @param block The block to append the IR for the instruction to.
+ * @param descriptor The location of the instruction. Includes information like PC, FPCR state, &c.
+ * @param instruction The instruction to translate.
+ * @return Whether translation may continue past this instruction (false once a terminal has been set).
+ */
+bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction);
+
+} // namespace A64
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp
new file mode 100644
index 0000000000..01cc1390c7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp
@@ -0,0 +1,128 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::B_cond(Imm<19> imm19, Cond cond) {
+ const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+ const u64 target = ir.PC() + offset;
+
+ const auto cond_pass = IR::Term::LinkBlock{ir.current_location->SetPC(target)};
+ const auto cond_fail = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)};
+ ir.SetTerm(IR::Term::If{cond, cond_pass, cond_fail});
+ return false;
+}
+
+bool TranslatorVisitor::B_uncond(Imm<26> imm26) {
+ const s64 offset = concatenate(imm26, Imm<2>{0}).SignExtend<s64>();
+ const u64 target = ir.PC() + offset;
+
+ ir.SetTerm(IR::Term::LinkBlock{ir.current_location->SetPC(target)});
+ return false;
+}
+
+bool TranslatorVisitor::BL(Imm<26> imm26) {
+ const s64 offset = concatenate(imm26, Imm<2>{0}).SignExtend<s64>();
+
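+    // Write the link register and push the return location onto the return stack
+    // buffer so that a matching RET (which terminates with PopRSBHint) can return quickly.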
+ X(64, Reg::R30, ir.Imm64(ir.PC() + 4));
+ ir.PushRSB(ir.current_location->AdvancePC(4));
+
+ const u64 target = ir.PC() + offset;
+ ir.SetTerm(IR::Term::LinkBlock{ir.current_location->SetPC(target)});
+ return false;
+}
+
+bool TranslatorVisitor::BLR(Reg Rn) {
+ const auto target = X(64, Rn);
+
+ X(64, Reg::R30, ir.Imm64(ir.PC() + 4));
+ ir.PushRSB(ir.current_location->AdvancePC(4));
+
+ ir.SetPC(target);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+bool TranslatorVisitor::BR(Reg Rn) {
+ const auto target = X(64, Rn);
+
+ ir.SetPC(target);
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+}
+
+bool TranslatorVisitor::RET(Reg Rn) {
+ const auto target = X(64, Rn);
+
+ ir.SetPC(target);
+ ir.SetTerm(IR::Term::PopRSBHint{});
+ return false;
+}
+
+bool TranslatorVisitor::CBZ(bool sf, Imm<19> imm19, Reg Rt) {
+ const size_t datasize = sf ? 64 : 32;
+ const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+
+ const IR::U32U64 operand1 = X(datasize, Rt);
+
+ ir.SetCheckBit(ir.IsZero(operand1));
+
+ const u64 target = ir.PC() + offset;
+ const auto cond_pass = IR::Term::LinkBlock{ir.current_location->SetPC(target)};
+ const auto cond_fail = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)};
+ ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail});
+ return false;
+}
+
+bool TranslatorVisitor::CBNZ(bool sf, Imm<19> imm19, Reg Rt) {
+ const size_t datasize = sf ? 64 : 32;
+ const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+
+ const IR::U32U64 operand1 = X(datasize, Rt);
+
+ ir.SetCheckBit(ir.IsZero(operand1));
+
+ const u64 target = ir.PC() + offset;
+ const auto cond_pass = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)};
+ const auto cond_fail = IR::Term::LinkBlock{ir.current_location->SetPC(target)};
+ ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail});
+ return false;
+}
+
+bool TranslatorVisitor::TBZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt) {
+ const size_t datasize = b5 == 1 ? 64 : 32;
+ const u8 bit_pos = concatenate(b5, b40).ZeroExtend<u8>();
+ const s64 offset = concatenate(imm14, Imm<2>{0}).SignExtend<s64>();
+
+ const auto operand = X(datasize, Rt);
+
+ ir.SetCheckBit(ir.TestBit(operand, ir.Imm8(bit_pos)));
+
+ const u64 target = ir.PC() + offset;
+ const auto cond_1 = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)};
+ const auto cond_0 = IR::Term::LinkBlock{ir.current_location->SetPC(target)};
+ ir.SetTerm(IR::Term::CheckBit{cond_1, cond_0});
+ return false;
+}
+
+bool TranslatorVisitor::TBNZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt) {
+ const size_t datasize = b5 == 1 ? 64 : 32;
+ const u8 bit_pos = concatenate(b5, b40).ZeroExtend<u8>();
+ const s64 offset = concatenate(imm14, Imm<2>{0}).SignExtend<s64>();
+
+ const auto operand = X(datasize, Rt);
+
+ ir.SetCheckBit(ir.TestBit(operand, ir.Imm8(bit_pos)));
+
+ const u64 target = ir.PC() + offset;
+ const auto cond_1 = IR::Term::LinkBlock{ir.current_location->SetPC(target)};
+ const auto cond_0 = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)};
+ ir.SetTerm(IR::Term::CheckBit{cond_1, cond_0});
+ return false;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp
new file mode 100644
index 0000000000..460ebfa6f8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp
@@ -0,0 +1,22 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::BRK(Imm<16> /*imm16*/) {
+ return RaiseException(Exception::Breakpoint);
+}
+
+bool TranslatorVisitor::SVC(Imm<16> imm16) {
+ ir.PushRSB(ir.current_location->AdvancePC(4));
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.CallSupervisor(imm16.ZeroExtend());
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}});
+ return false;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp
new file mode 100644
index 0000000000..5df4fe1ce9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp
@@ -0,0 +1,330 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::ADD_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) {
+ u64 imm;
+ switch (shift.ZeroExtend()) {
+ case 0b00:
+ imm = imm12.ZeroExtend<u64>();
+ break;
+ case 0b01:
+ imm = imm12.ZeroExtend<u64>() << 12;
+ break;
+ default:
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+
+ const auto result = ir.Add(operand1, I(datasize, imm));
+
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::ADDS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) {
+ u64 imm;
+ switch (shift.ZeroExtend()) {
+ case 0b00:
+ imm = imm12.ZeroExtend<u64>();
+ break;
+ case 0b01:
+ imm = imm12.ZeroExtend<u64>() << 12;
+ break;
+ default:
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+
+ const auto result = ir.Add(operand1, I(datasize, imm));
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUB_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) {
+ u64 imm;
+ switch (shift.ZeroExtend()) {
+ case 0b00:
+ imm = imm12.ZeroExtend<u64>();
+ break;
+ case 0b01:
+ imm = imm12.ZeroExtend<u64>() << 12;
+ break;
+ default:
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+
+ const auto result = ir.Sub(operand1, I(datasize, imm));
+
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::SUBS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) {
+ u64 imm;
+ switch (shift.ZeroExtend()) {
+ case 0b00:
+ imm = imm12.ZeroExtend<u64>();
+ break;
+ case 0b01:
+ imm = imm12.ZeroExtend<u64>() << 12;
+ break;
+ default:
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+
+ const auto result = ir.Sub(operand1, I(datasize, imm));
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADD_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (shift == 0b11) {
+ return ReservedValue();
+ }
+
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+
+ const auto result = ir.Add(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (shift == 0b11) {
+ return ReservedValue();
+ }
+
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+
+ const auto result = ir.Add(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUB_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (shift == 0b11) {
+ return ReservedValue();
+ }
+
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+
+ const auto result = ir.Sub(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUBS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (shift == 0b11) {
+ return ReservedValue();
+ }
+
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+
+ const auto result = ir.Sub(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADD_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) {
+ const u8 shift = imm3.ZeroExtend<u8>();
+ if (shift > 4) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+ const auto operand2 = ExtendReg(datasize, Rm, option, shift);
+
+ const auto result = ir.Add(operand1, operand2);
+
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::ADDS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) {
+ const u8 shift = imm3.ZeroExtend<u8>();
+ if (shift > 4) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+ const auto operand2 = ExtendReg(datasize, Rm, option, shift);
+
+ const auto result = ir.Add(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUB_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) {
+ const u8 shift = imm3.ZeroExtend<u8>();
+ if (shift > 4) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+ const auto operand2 = ExtendReg(datasize, Rm, option, shift);
+
+ const auto result = ir.Sub(operand1, operand2);
+
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::SUBS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) {
+ const u8 shift = imm3.ZeroExtend<u8>();
+ if (shift > 4) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn));
+ const auto operand2 = ExtendReg(datasize, Rm, option, shift);
+
+ const auto result = ir.Sub(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADC(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const auto result = ir.AddWithCarry(operand1, operand2, ir.GetCFlag());
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADCS(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const auto result = ir.AddWithCarry(operand1, operand2, ir.GetCFlag());
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SBC(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const auto result = ir.SubWithCarry(operand1, operand2, ir.GetCFlag());
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SBCS(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const auto result = ir.SubWithCarry(operand1, operand2, ir.GetCFlag());
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp
new file mode 100644
index 0000000000..aec6729ad8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp
@@ -0,0 +1,155 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::SBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (sf && !N) {
+ return ReservedValue();
+ }
+
+ if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) {
+ return ReservedValue();
+ }
+
+ const u8 R = immr.ZeroExtend<u8>();
+ const u8 S = imms.ZeroExtend<u8>();
+ const auto masks = DecodeBitMasks(N, imms, immr, false);
+ if (!masks) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto src = X(datasize, Rn);
+
+ auto bot = ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask));
+ auto top = ir.ReplicateBit(src, S);
+
+ top = ir.And(top, I(datasize, ~masks->tmask));
+ bot = ir.And(bot, I(datasize, masks->tmask));
+
+ X(datasize, Rd, ir.Or(top, bot));
+ return true;
+}
+
+bool TranslatorVisitor::BFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (sf && !N) {
+ return ReservedValue();
+ }
+
+ if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) {
+ return ReservedValue();
+ }
+
+ const u8 R = immr.ZeroExtend<u8>();
+ const auto masks = DecodeBitMasks(N, imms, immr, false);
+ if (!masks) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto dst = X(datasize, Rd);
+ const auto src = X(datasize, Rn);
+
+ const auto bot = ir.Or(ir.And(dst, I(datasize, ~masks->wmask)), ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask)));
+
+ X(datasize, Rd, ir.Or(ir.And(dst, I(datasize, ~masks->tmask)), ir.And(bot, I(datasize, masks->tmask))));
+ return true;
+}
+
+bool TranslatorVisitor::UBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (sf && !N) {
+ return ReservedValue();
+ }
+
+ if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) {
+ return ReservedValue();
+ }
+
+ const u8 R = immr.ZeroExtend<u8>();
+ const auto masks = DecodeBitMasks(N, imms, immr, false);
+ if (!masks) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto src = X(datasize, Rn);
+ const auto bot = ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask));
+
+ X(datasize, Rd, ir.And(bot, I(datasize, masks->tmask)));
+ return true;
+}
+
+bool TranslatorVisitor::ASR_1(Imm<5> immr, Reg Rn, Reg Rd) {
+ const auto src = X(32, Rn);
+ const auto result = ir.ArithmeticShiftRightMasked(src, ir.Imm32(immr.ZeroExtend<u32>()));
+ X(32, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ASR_2(Imm<6> immr, Reg Rn, Reg Rd) {
+ const auto src = X(64, Rn);
+ const auto result = ir.ArithmeticShiftRightMasked(src, ir.Imm64(immr.ZeroExtend<u64>()));
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SXTB_1(Reg Rn, Reg Rd) {
+ const auto src = X(32, Rn);
+ const auto result = ir.SignExtendToWord(ir.LeastSignificantByte(src));
+ X(32, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SXTB_2(Reg Rn, Reg Rd) {
+ const auto src = X(64, Rn);
+ const auto result = ir.SignExtendToLong(ir.LeastSignificantByte(src));
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SXTH_1(Reg Rn, Reg Rd) {
+ const auto src = X(32, Rn);
+ const auto result = ir.SignExtendToWord(ir.LeastSignificantHalf(src));
+ X(32, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SXTH_2(Reg Rn, Reg Rd) {
+ const auto src = X(64, Rn);
+ const auto result = ir.SignExtendToLong(ir.LeastSignificantHalf(src));
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SXTW(Reg Rn, Reg Rd) {
+ const auto src = X(64, Rn);
+ const auto result = ir.SignExtendToLong(ir.LeastSignificantWord(src));
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::EXTR(bool sf, bool N, Reg Rm, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (N != sf) {
+ return UnallocatedEncoding();
+ }
+
+ if (!sf && imms.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 m = X(datasize, Rm);
+ const IR::U32U64 n = X(datasize, Rn);
+ const IR::U32U64 result = ir.ExtractRegister(m, n, ir.Imm8(imms.ZeroExtend<u8>()));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp
new file mode 100644
index 0000000000..03725c262d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp
@@ -0,0 +1,62 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::CCMN_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv) {
+ const size_t datasize = sf ? 64 : 32;
+ const u32 flags = nzcv.ZeroExtend<u32>() << 28;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, operand2, ir.Imm1(0)));
+ const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
+ ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
+ return true;
+}
+
+bool TranslatorVisitor::CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv) {
+ const size_t datasize = sf ? 64 : 32;
+ const u32 flags = nzcv.ZeroExtend<u32>() << 28;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
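+    // The compare is a subtraction with a carry-in of 1 (operand1 + NOT(operand2) + 1);
+    // only its flags are kept, and they are selected against the immediate NZCV
+    // when the condition fails.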
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
+ const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
+ ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
+ return true;
+}
+
+bool TranslatorVisitor::CCMN_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv) {
+ const size_t datasize = sf ? 64 : 32;
+ const u32 flags = nzcv.ZeroExtend<u32>() << 28;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>());
+
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, operand2, ir.Imm1(0)));
+ const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
+ ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
+ return true;
+}
+
+bool TranslatorVisitor::CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv) {
+ const size_t datasize = sf ? 64 : 32;
+ const u32 flags = nzcv.ZeroExtend<u32>() << 28;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>());
+
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
+ const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
+ ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp
new file mode 100644
index 0000000000..49dadaa3be
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp
@@ -0,0 +1,58 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::CSEL(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CSINC(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Add(operand2, I(datasize, 1)));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CSINV(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Not(operand2));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CSNEG(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand1 = X(datasize, Rn);
+ const IR::U32U64 operand2 = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Add(ir.Not(operand2), I(datasize, 1)));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp
new file mode 100644
index 0000000000..a0bf932a2a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp
@@ -0,0 +1,76 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::CRC32(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd) {
+ const u32 integral_size = sz.ZeroExtend();
+
+ if (sf && integral_size != 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ if (!sf && integral_size == 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32 result = [&] {
+ const size_t datasize = sf ? 64 : 32;
+ const IR::U32 accumulator = ir.GetW(Rn);
+ const IR::U32U64 data = X(datasize, Rm);
+
+ switch (integral_size) {
+ case 0b00:
+ return ir.CRC32ISO8(accumulator, data);
+ case 0b01:
+ return ir.CRC32ISO16(accumulator, data);
+ case 0b10:
+ return ir.CRC32ISO32(accumulator, data);
+ case 0b11:
+ default:
+ return ir.CRC32ISO64(accumulator, data);
+ }
+ }();
+
+ X(32, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CRC32C(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd) {
+ const u32 integral_size = sz.ZeroExtend();
+
+ if (sf && integral_size != 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ if (!sf && integral_size == 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32 result = [&] {
+ const size_t datasize = sf ? 64 : 32;
+ const IR::U32 accumulator = ir.GetW(Rn);
+ const IR::U32U64 data = X(datasize, Rm);
+
+ switch (integral_size) {
+ case 0b00:
+ return ir.CRC32Castagnoli8(accumulator, data);
+ case 0b01:
+ return ir.CRC32Castagnoli16(accumulator, data);
+ case 0b10:
+ return ir.CRC32Castagnoli32(accumulator, data);
+ case 0b11:
+ default:
+ return ir.CRC32Castagnoli64(accumulator, data);
+ }
+ }();
+
+ X(32, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp
new file mode 100644
index 0000000000..d280e37ee8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp
@@ -0,0 +1,236 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::AND_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (!sf && N) {
+ return ReservedValue();
+ }
+
+ u64 imm;
+ if (auto masks = DecodeBitMasks(N, imms, immr, true)) {
+ imm = masks->wmask;
+ } else {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = X(datasize, Rn);
+
+ const auto result = ir.And(operand1, I(datasize, imm));
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::ORR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (!sf && N) {
+ return ReservedValue();
+ }
+
+ u64 imm;
+ if (auto masks = DecodeBitMasks(N, imms, immr, true)) {
+ imm = masks->wmask;
+ } else {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = X(datasize, Rn);
+
+ const auto result = ir.Or(operand1, I(datasize, imm));
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::EOR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (!sf && N) {
+ return ReservedValue();
+ }
+
+ u64 imm;
+ if (auto masks = DecodeBitMasks(N, imms, immr, true)) {
+ imm = masks->wmask;
+ } else {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = X(datasize, Rn);
+
+ const auto result = ir.Eor(operand1, I(datasize, imm));
+ if (Rd == Reg::SP) {
+ SP(datasize, result);
+ } else {
+ X(datasize, Rd, result);
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::ANDS_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) {
+ if (!sf && N) {
+ return ReservedValue();
+ }
+
+ u64 imm;
+ if (auto masks = DecodeBitMasks(N, imms, immr, true)) {
+ imm = masks->wmask;
+ } else {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const auto operand1 = X(datasize, Rn);
+ const auto result = ir.And(operand1, I(datasize, imm));
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::AND_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.And(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BIC_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.AndNot(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ORR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.Or(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ORN_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)));
+ const auto result = ir.Or(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::EOR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.Eor(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::EON(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)));
+ const auto result = ir.Eor(operand1, operand2);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ANDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.And(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BICS(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) {
+ if (!sf && imm6.Bit<5>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const u8 shift_amount = imm6.ZeroExtend<u8>();
+
+ const auto operand1 = X(datasize, Rn);
+ const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+ const auto result = ir.AndNot(operand1, operand2);
+
+ ir.SetNZCV(ir.NZCVFrom(result));
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp
new file mode 100644
index 0000000000..60910803f0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp
@@ -0,0 +1,100 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::MADD(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 a = X(datasize, Ra);
+ const IR::U32U64 m = X(datasize, Rm);
+ const IR::U32U64 n = X(datasize, Rn);
+
+ const IR::U32U64 result = ir.Add(a, ir.Mul(n, m));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MSUB(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 a = X(datasize, Ra);
+ const IR::U32U64 m = X(datasize, Rm);
+ const IR::U32U64 n = X(datasize, Rn);
+
+ const IR::U32U64 result = ir.Sub(a, ir.Mul(n, m));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const IR::U64 a = X(64, Ra);
+ const IR::U64 m = ir.SignExtendToLong(X(32, Rm));
+ const IR::U64 n = ir.SignExtendToLong(X(32, Rn));
+
+ const IR::U64 result = ir.Add(a, ir.Mul(n, m));
+
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const IR::U64 a = X(64, Ra);
+ const IR::U64 m = ir.SignExtendToLong(X(32, Rm));
+ const IR::U64 n = ir.SignExtendToLong(X(32, Rn));
+
+ const IR::U64 result = ir.Sub(a, ir.Mul(n, m));
+
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SMULH(Reg Rm, Reg Rn, Reg Rd) {
+ const IR::U64 m = X(64, Rm);
+ const IR::U64 n = X(64, Rn);
+
+ const IR::U64 result = ir.SignedMultiplyHigh(n, m);
+
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const IR::U64 a = X(64, Ra);
+ const IR::U64 m = ir.ZeroExtendToLong(X(32, Rm));
+ const IR::U64 n = ir.ZeroExtendToLong(X(32, Rn));
+
+ const IR::U64 result = ir.Add(a, ir.Mul(n, m));
+
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) {
+ const IR::U64 a = X(64, Ra);
+ const IR::U64 m = ir.ZeroExtendToLong(X(32, Rm));
+ const IR::U64 n = ir.ZeroExtendToLong(X(32, Rn));
+
+ const IR::U64 result = ir.Sub(a, ir.Mul(n, m));
+
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UMULH(Reg Rm, Reg Rn, Reg Rd) {
+ const IR::U64 m = X(64, Rm);
+ const IR::U64 n = X(64, Rn);
+
+ const IR::U64 result = ir.UnsignedMultiplyHigh(n, m);
+
+ X(64, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp
new file mode 100644
index 0000000000..48b860a091
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp
@@ -0,0 +1,24 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::ADR(Imm<2> immlo, Imm<19> immhi, Reg Rd) {
+ const u64 imm = concatenate(immhi, immlo).SignExtend<u64>();
+ const u64 base = ir.PC();
+ X(64, Rd, ir.Imm64(base + imm));
+ return true;
+}
+
+bool TranslatorVisitor::ADRP(Imm<2> immlo, Imm<19> immhi, Reg Rd) {
+ const u64 imm = concatenate(immhi, immlo).SignExtend<u64>() << 12;
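+    // ADRP is page-relative: the immediate is in units of 4KiB pages and the low
+    // 12 bits of the current PC are cleared before the addition.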
+ const u64 base = ir.PC() & ~u64(0xFFF);
+ X(64, Rd, ir.Imm64(base + imm));
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp
new file mode 100644
index 0000000000..3484a1e3d8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp
@@ -0,0 +1,139 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::CLZ_int(bool sf, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 result = ir.CountLeadingZeros(operand);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CLS_int(bool sf, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 result = ir.Sub(ir.CountLeadingZeros(ir.Eor(operand, ir.ArithmeticShiftRight(operand, ir.Imm8(u8(datasize))))), I(datasize, 1));
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::RBIT_int(bool sf, Reg Rn, Reg Rd) {
+ const auto rbit32 = [this](const IR::U32& operand) {
+ // x = (x & 0x55555555) << 1 | ((x >> 1) & 0x55555555);
+ const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(operand, ir.Imm32(0x55555555)), ir.Imm8(1));
+ const IR::U32 first_lsr = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(1)), ir.Imm32(0x55555555));
+ const IR::U32 first = ir.Or(first_lsl, first_lsr);
+
+ // x = (x & 0x33333333) << 2 | ((x >> 2) & 0x33333333);
+ const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(first, ir.Imm32(0x33333333)), ir.Imm8(2));
+ const IR::U32 second_lsr = ir.And(ir.LogicalShiftRight(first, ir.Imm8(2)), ir.Imm32(0x33333333));
+ const IR::U32 second = ir.Or(second_lsl, second_lsr);
+
+ // x = (x & 0x0F0F0F0F) << 4 | ((x >> 4) & 0x0F0F0F0F);
+ const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(second, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4));
+ const IR::U32 third_lsr = ir.And(ir.LogicalShiftRight(second, ir.Imm8(4)), ir.Imm32(0x0F0F0F0F));
+ const IR::U32 third = ir.Or(third_lsl, third_lsr);
+
+ // x = (x << 24) | ((x & 0xFF00) << 8) | ((x >> 8) & 0xFF00) | (x >> 24);
+ const IR::U32 fourth_lsl = ir.Or(ir.LogicalShiftLeft(third, ir.Imm8(24)),
+ ir.LogicalShiftLeft(ir.And(third, ir.Imm32(0xFF00)), ir.Imm8(8)));
+ const IR::U32 fourth_lsr = ir.Or(ir.And(ir.LogicalShiftRight(third, ir.Imm8(8)), ir.Imm32(0xFF00)),
+ ir.LogicalShiftRight(third, ir.Imm8(24)));
+ return ir.Or(fourth_lsl, fourth_lsr);
+ };
+
+ const size_t datasize = sf ? 64 : 32;
+ const IR::U32U64 operand = X(datasize, Rn);
+
+ if (sf) {
+ const IR::U32 lsw = rbit32(ir.LeastSignificantWord(operand));
+ const IR::U32 msw = rbit32(ir.MostSignificantWord(operand).result);
+ const IR::U64 result = ir.Pack2x32To1x64(msw, lsw);
+
+ X(datasize, Rd, result);
+ } else {
+ X(datasize, Rd, rbit32(operand));
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::REV(bool sf, bool opc_0, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+    if (!sf && opc_0) {
+        return UnallocatedEncoding();
+    }
+
+ const IR::U32U64 operand = X(datasize, Rn);
+
+ if (sf) {
+ X(datasize, Rd, ir.ByteReverseDual(operand));
+ } else {
+ X(datasize, Rd, ir.ByteReverseWord(operand));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::REV32_int(Reg Rn, Reg Rd) {
+ const IR::U64 operand = ir.GetX(Rn);
+ const IR::U32 lo = ir.ByteReverseWord(ir.LeastSignificantWord(operand));
+ const IR::U32 hi = ir.ByteReverseWord(ir.MostSignificantWord(operand).result);
+ const IR::U64 result = ir.Pack2x32To1x64(lo, hi);
+ X(64, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::REV16_int(bool sf, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ if (sf) {
+ const IR::U64 operand = X(datasize, Rn);
+ const IR::U64 hihalf = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(8)), ir.Imm64(0x00FF00FF00FF00FF));
+ const IR::U64 lohalf = ir.And(ir.LogicalShiftLeft(operand, ir.Imm8(8)), ir.Imm64(0xFF00FF00FF00FF00));
+ const IR::U64 result = ir.Or(hihalf, lohalf);
+ X(datasize, Rd, result);
+ } else {
+ const IR::U32 operand = X(datasize, Rn);
+ const IR::U32 hihalf = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(8)), ir.Imm32(0x00FF00FF));
+ const IR::U32 lohalf = ir.And(ir.LogicalShiftLeft(operand, ir.Imm8(8)), ir.Imm32(0xFF00FF00));
+ const IR::U32 result = ir.Or(hihalf, lohalf);
+ X(datasize, Rd, result);
+ }
+ return true;
+}
+
+bool TranslatorVisitor::UDIV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 m = X(datasize, Rm);
+ const IR::U32U64 n = X(datasize, Rn);
+
+ const IR::U32U64 result = ir.UnsignedDiv(n, m);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SDIV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 m = X(datasize, Rm);
+ const IR::U32U64 n = X(datasize, Rn);
+
+ const IR::U32U64 result = ir.SignedDiv(n, m);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp
new file mode 100644
index 0000000000..1b48abd7e3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp
@@ -0,0 +1,58 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::LSLV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 shift_amount = X(datasize, Rm);
+
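+    // The shift amount is taken modulo the datasize, hence the "Masked" IR operation.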
+ const IR::U32U64 result = ir.LogicalShiftLeftMasked(operand, shift_amount);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::LSRV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 shift_amount = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.LogicalShiftRightMasked(operand, shift_amount);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ASRV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 shift_amount = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.ArithmeticShiftRightMasked(operand, shift_amount);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::RORV(bool sf, Reg Rm, Reg Rn, Reg Rd) {
+ const size_t datasize = sf ? 64 : 32;
+
+ const IR::U32U64 operand = X(datasize, Rn);
+ const IR::U32U64 shift_amount = X(datasize, Rm);
+
+ const IR::U32U64 result = ir.RotateRightMasked(operand, shift_amount);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp
new file mode 100644
index 0000000000..c1835b815d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp
@@ -0,0 +1,38 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+bool FPCompare(TranslatorVisitor& v, Imm<2> type, Vec Vm, Vec Vn, bool exc_on_qnan, bool cmp_with_zero) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return v.UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = v.V_scalar(*datasize, Vn);
+ IR::U32U64 operand2;
+ if (cmp_with_zero) {
+ operand2 = v.I(*datasize, 0);
+ } else {
+ operand2 = v.V_scalar(*datasize, Vm);
+ }
+
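+    // The comparison result is written directly to the NZCV flags. exc_on_qnan selects the
+    // signalling variant (FCMPE), which raises Invalid Operation even for quiet NaN operands.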
+ const auto nzcv = v.ir.FPCompare(operand1, operand2, exc_on_qnan);
+ v.ir.SetNZCV(nzcv);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::FCMP_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero) {
+ return FPCompare(*this, type, Vm, Vn, false, cmp_with_zero);
+}
+
+bool TranslatorVisitor::FCMPE_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero) {
+ return FPCompare(*this, type, Vm, Vn, true, cmp_with_zero);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp
new file mode 100644
index 0000000000..a0b1022410
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp
@@ -0,0 +1,35 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+bool FPCompare(TranslatorVisitor& v, Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv, bool exc_on_qnan) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return v.UnallocatedEncoding();
+ }
+ const u32 flags = nzcv.ZeroExtend<u32>() << 28;
+
+ const IR::U32U64 operand1 = v.V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = v.V_scalar(*datasize, Vm);
+
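+    // If cond holds, NZCV is set from the floating-point comparison;
+    // otherwise it is loaded from the immediate nzcv field.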
+ const IR::NZCV then_flags = v.ir.FPCompare(operand1, operand2, exc_on_qnan);
+ const IR::NZCV else_flags = v.ir.NZCVFromPackedFlags(v.ir.Imm32(flags));
+ v.ir.SetNZCV(v.ir.ConditionalSelect(cond, then_flags, else_flags));
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::FCCMP_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv) {
+ return FPCompare(*this, type, Vm, cond, Vn, nzcv, false);
+}
+
+bool TranslatorVisitor::FCCMPE_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv) {
+ return FPCompare(*this, type, Vm, cond, Vn, nzcv, true);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp
new file mode 100644
index 0000000000..11ce6f623a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp
@@ -0,0 +1,24 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::FCSEL_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
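+    // FCSEL: Vd = cond ? Vn : Vm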
+ const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, operand2);
+ V_scalar(*datasize, Vd, result);
+
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp
new file mode 100644
index 0000000000..5fa7ac1f98
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp
@@ -0,0 +1,114 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::SCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize || *fltsize == 16) {
+ return UnallocatedEncoding();
+ }
+ if (!sf && !scale.Bit<5>()) {
+ return UnallocatedEncoding();
+ }
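+    // The scale field encodes the number of fractional bits as 64 - scale.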
+ const u8 fracbits = 64 - scale.ZeroExtend<u8>();
+ const FP::RoundingMode rounding_mode = ir.current_location->FPCR().RMode();
+
+ const IR::U32U64 intval = X(intsize, Rn);
+ const IR::U32U64 fltval = [&]() -> IR::U32U64 {
+ switch (*fltsize) {
+ case 32:
+ return ir.FPSignedFixedToSingle(intval, fracbits, rounding_mode);
+ case 64:
+ return ir.FPSignedFixedToDouble(intval, fracbits, rounding_mode);
+ }
+ UNREACHABLE();
+ }();
+
+ V_scalar(*fltsize, Vd, fltval);
+ return true;
+}
+
+bool TranslatorVisitor::UCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize || *fltsize == 16) {
+ return UnallocatedEncoding();
+ }
+ if (!sf && !scale.Bit<5>()) {
+ return UnallocatedEncoding();
+ }
+ const u8 fracbits = 64 - scale.ZeroExtend<u8>();
+ const FP::RoundingMode rounding_mode = ir.current_location->FPCR().RMode();
+
+ const IR::U32U64 intval = X(intsize, Rn);
+ const IR::U32U64 fltval = [&]() -> IR::U32U64 {
+ switch (*fltsize) {
+ case 32:
+ return ir.FPUnsignedFixedToSingle(intval, fracbits, rounding_mode);
+ case 64:
+ return ir.FPUnsignedFixedToDouble(intval, fracbits, rounding_mode);
+ }
+ UNREACHABLE();
+ }();
+
+ V_scalar(*fltsize, Vd, fltval);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTZS_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize) {
+ return UnallocatedEncoding();
+ }
+ if (!sf && !scale.Bit<5>()) {
+ return UnallocatedEncoding();
+ }
+ const u8 fracbits = 64 - scale.ZeroExtend<u8>();
+
+ const IR::U16U32U64 fltval = V_scalar(*fltsize, Vn);
+ IR::U32U64 intval;
+ if (intsize == 32) {
+ intval = ir.FPToFixedS32(fltval, fracbits, FP::RoundingMode::TowardsZero);
+ } else if (intsize == 64) {
+ intval = ir.FPToFixedS64(fltval, fracbits, FP::RoundingMode::TowardsZero);
+ } else {
+ UNREACHABLE();
+ }
+
+ X(intsize, Rd, intval);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTZU_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize) {
+ return UnallocatedEncoding();
+ }
+ if (!sf && !scale.Bit<5>()) {
+ return UnallocatedEncoding();
+ }
+ const u8 fracbits = 64 - scale.ZeroExtend<u8>();
+
+ const IR::U16U32U64 fltval = V_scalar(*fltsize, Vn);
+ IR::U32U64 intval;
+ if (intsize == 32) {
+ intval = ir.FPToFixedU32(fltval, fracbits, FP::RoundingMode::TowardsZero);
+ } else if (intsize == 64) {
+ intval = ir.FPToFixedU64(fltval, fracbits, FP::RoundingMode::TowardsZero);
+ } else {
+ UNREACHABLE();
+ }
+
+ X(intsize, Rd, intval);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp
new file mode 100644
index 0000000000..2f98ebfb13
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp
@@ -0,0 +1,199 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::SCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize || *fltsize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 intval = X(intsize, Rn);
+ IR::U32U64 fltval;
+
+ if (*fltsize == 32) {
+ fltval = ir.FPSignedFixedToSingle(intval, 0, ir.current_location->FPCR().RMode());
+ } else if (*fltsize == 64) {
+ fltval = ir.FPSignedFixedToDouble(intval, 0, ir.current_location->FPCR().RMode());
+ } else {
+ UNREACHABLE();
+ }
+
+ V_scalar(*fltsize, Vd, fltval);
+
+ return true;
+}
+
+bool TranslatorVisitor::UCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize || *fltsize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 intval = X(intsize, Rn);
+ IR::U32U64 fltval;
+
+ if (*fltsize == 32) {
+ fltval = ir.FPUnsignedFixedToSingle(intval, 0, ir.current_location->FPCR().RMode());
+ } else if (*fltsize == 64) {
+ fltval = ir.FPUnsignedFixedToDouble(intval, 0, ir.current_location->FPCR().RMode());
+ } else {
+ UNREACHABLE();
+ }
+
+ V_scalar(*fltsize, Vd, fltval);
+
+ return true;
+}
+
+bool TranslatorVisitor::FMOV_float_gen(bool sf, Imm<2> type, Imm<1> rmode_0, Imm<1> opc_0, size_t n, size_t d) {
+ // NOTE:
+ // opcode<2:1> == 0b11
+ // rmode<1> == 0b0
+
+ if (type == 0b10 && rmode_0 != 1) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t intsize = sf ? 64 : 32;
+ size_t fltsize = [type] {
+ switch (type.ZeroExtend()) {
+ case 0b00:
+ return 32;
+ case 0b01:
+ return 64;
+ case 0b10:
+ return 128;
+ case 0b11:
+ return 16;
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ bool integer_to_float;
+ size_t part;
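+    // rmode<0> == 1 selects the upper 64-bit half of a 128-bit vector register
+    // (the FMOV Xd, Vn.D[1] and FMOV Vd.D[1], Xn forms); otherwise part 0 is used.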
+ switch (rmode_0.ZeroExtend()) {
+ case 0b0:
+ if (fltsize != 16 && fltsize != intsize) {
+ return UnallocatedEncoding();
+ }
+ integer_to_float = opc_0 == 0b1;
+ part = 0;
+ break;
+ default:
+ case 0b1:
+ if (intsize != 64 || fltsize != 128) {
+ return UnallocatedEncoding();
+ }
+ integer_to_float = opc_0 == 0b1;
+ part = 1;
+ fltsize = 64;
+ break;
+ }
+
+ if (integer_to_float) {
+ const IR::U16U32U64 intval = X(fltsize, static_cast<Reg>(n));
+ Vpart_scalar(fltsize, static_cast<Vec>(d), part, intval);
+ } else {
+ const IR::UAny fltval = Vpart_scalar(fltsize, static_cast<Vec>(n), part);
+ const IR::U32U64 intval = ZeroExtend(fltval, intsize);
+ X(intsize, static_cast<Reg>(d), intval);
+ }
+
+ return true;
+}
+
+static bool FloatingPointConvertSignedInteger(TranslatorVisitor& v, bool sf, Imm<2> type, Vec Vn, Reg Rd, FP::RoundingMode rounding_mode) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize) {
+ return v.UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 fltval = v.V_scalar(*fltsize, Vn);
+ IR::U32U64 intval;
+
+ if (intsize == 32) {
+ intval = v.ir.FPToFixedS32(fltval, 0, rounding_mode);
+ } else if (intsize == 64) {
+ intval = v.ir.FPToFixedS64(fltval, 0, rounding_mode);
+ } else {
+ UNREACHABLE();
+ }
+
+ v.X(intsize, Rd, intval);
+ return true;
+}
+
+static bool FloatingPointConvertUnsignedInteger(TranslatorVisitor& v, bool sf, Imm<2> type, Vec Vn, Reg Rd, FP::RoundingMode rounding_mode) {
+ const size_t intsize = sf ? 64 : 32;
+ const auto fltsize = FPGetDataSize(type);
+ if (!fltsize) {
+ return v.UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 fltval = v.V_scalar(*fltsize, Vn);
+ IR::U32U64 intval;
+
+ if (intsize == 32) {
+ intval = v.ir.FPToFixedU32(fltval, 0, rounding_mode);
+ } else if (intsize == 64) {
+ intval = v.ir.FPToFixedU64(fltval, 0, rounding_mode);
+ } else {
+ UNREACHABLE();
+ }
+
+ v.X(intsize, Rd, intval);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTNS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieEven);
+}
+
+bool TranslatorVisitor::FCVTNU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieEven);
+}
+
+bool TranslatorVisitor::FCVTZS_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FCVTZU_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FCVTAS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+
+bool TranslatorVisitor::FCVTAU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+
+bool TranslatorVisitor::FCVTPS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsPlusInfinity);
+}
+
+bool TranslatorVisitor::FCVTPU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsPlusInfinity);
+}
+
+bool TranslatorVisitor::FCVTMS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsMinusInfinity);
+}
+
+bool TranslatorVisitor::FCVTMU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) {
+    return FloatingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsMinusInfinity);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
new file mode 100644
index 0000000000..187f0a8908
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp
@@ -0,0 +1,186 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::FMOV_float(Imm<2> type, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operand = V_scalar(*datasize, Vn);
+
+ V_scalar(*datasize, Vd, operand);
+ return true;
+}
+
+bool TranslatorVisitor::FABS_float(Imm<2> type, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operand = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 result = ir.FPAbs(operand);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FNEG_float(Imm<2> type, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operand = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 result = ir.FPNeg(operand);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FSQRT_float(Imm<2> type, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand = V_scalar(*datasize, Vn);
+ const IR::U32U64 result = ir.FPSqrt(operand);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMOV_float_imm(Imm<2> type, Imm<8> imm8, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
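+    // Expand imm8 into a full-width constant (VFPExpandImm): sign = imm8<7>,
+    // exponent = NOT(imm8<6>) : Replicate(imm8<6>) : imm8<5:4>, fraction = imm8<3:0> : Zeros().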
+ IR::UAny result = [&]() -> IR::UAny {
+ switch (*datasize) {
+ case 16: {
+ const u16 sign = imm8.Bit<7>() ? 1 : 0;
+ const u16 exp = (imm8.Bit<6>() ? 0b0'1100 : 0b1'0000) | imm8.Bits<4, 5, u16>();
+ const u16 fract = imm8.Bits<0, 3, u16>() << 6;
+ return ir.Imm16((sign << 15) | (exp << 10) | fract);
+ }
+ case 32: {
+ const u32 sign = imm8.Bit<7>() ? 1 : 0;
+ const u32 exp = (imm8.Bit<6>() ? 0b0111'1100 : 0b1000'0000) | imm8.Bits<4, 5, u32>();
+ const u32 fract = imm8.Bits<0, 3, u32>() << 19;
+ return ir.Imm32((sign << 31) | (exp << 23) | fract);
+ }
+ case 64:
+ default: {
+ const u64 sign = imm8.Bit<7>() ? 1 : 0;
+ const u64 exp = (imm8.Bit<6>() ? 0b011'1111'1100 : 0b100'0000'0000) | imm8.Bits<4, 5, u64>();
+ const u64 fract = imm8.Bits<0, 3, u64>() << 48;
+ return ir.Imm64((sign << 63) | (exp << 52) | fract);
+ }
+ }
+ }();
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCVT_float(Imm<2> type, Imm<2> opc, Vec Vn, Vec Vd) {
+ if (type == opc) {
+ return UnallocatedEncoding();
+ }
+
+ const auto srcsize = FPGetDataSize(type);
+ const auto dstsize = FPGetDataSize(opc);
+
+ if (!srcsize || !dstsize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::UAny operand = V_scalar(*srcsize, Vn);
+ const auto rounding_mode = ir.current_location->FPCR().RMode();
+
+ IR::UAny result;
+ switch (*srcsize) {
+ case 16:
+ switch (*dstsize) {
+ case 32:
+ result = ir.FPHalfToSingle(operand, rounding_mode);
+ break;
+ case 64:
+ result = ir.FPHalfToDouble(operand, rounding_mode);
+ break;
+ }
+ break;
+ case 32:
+ switch (*dstsize) {
+ case 16:
+ result = ir.FPSingleToHalf(operand, rounding_mode);
+ break;
+ case 64:
+ result = ir.FPSingleToDouble(operand, rounding_mode);
+ break;
+ }
+ break;
+ case 64:
+ switch (*dstsize) {
+ case 16:
+ result = ir.FPDoubleToHalf(operand, rounding_mode);
+ break;
+ case 32:
+ result = ir.FPDoubleToSingle(operand, rounding_mode);
+ break;
+ }
+ break;
+ }
+
+ V_scalar(*dstsize, Vd, result);
+
+ return true;
+}
+
+static bool FloatingPointRoundToIntegral(TranslatorVisitor& v, Imm<2> type, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return v.UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operand = v.V_scalar(*datasize, Vn);
+ const IR::U16U32U64 result = v.ir.FPRoundInt(operand, rounding_mode, exact);
+ v.V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRINTN_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false);
+}
+
+bool TranslatorVisitor::FRINTP_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTM_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTZ_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsZero, false);
+}
+
+bool TranslatorVisitor::FRINTA_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false);
+}
+
+bool TranslatorVisitor::FRINTX_float(Imm<2> type, Vec Vn, Vec Vd) {
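+    // FRINTX uses the FPCR rounding mode and signals Inexact when the result differs from the input.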
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, ir.current_location->FPCR().RMode(), true);
+}
+
+bool TranslatorVisitor::FRINTI_float(Imm<2> type, Vec Vn, Vec Vd) {
+ return FloatingPointRoundToIntegral(*this, type, Vn, Vd, ir.current_location->FPCR().RMode(), false);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
new file mode 100644
index 0000000000..75c38c012b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp
@@ -0,0 +1,66 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
+ const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
+ const IR::U16U32U64 result = ir.FPMulAdd(operanda, operand1, operand2);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
+ const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
+ const IR::U16U32U64 result = ir.FPMulSub(operanda, operand1, operand2);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
+ const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
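+    // FNMADD: Vd = -(Va + Vn * Vm)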
+ const IR::U16U32U64 result = ir.FPMulSub(ir.FPNeg(operanda), operand1, operand2);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U16U32U64 operanda = V_scalar(*datasize, Va);
+ const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm);
+ const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), operand1, operand2);
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
new file mode 100644
index 0000000000..a4dbda9cce
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp
@@ -0,0 +1,145 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::FMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPMul(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FDIV_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPDiv(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FADD_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPAdd(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FSUB_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPSub(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMAX_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPMax(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMIN_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPMin(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMAXNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPMaxNumeric(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMINNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPMinNumeric(operand1, operand2);
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FNMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) {
+ const auto datasize = FPGetDataSize(type);
+ if (!datasize || *datasize == 16) {
+ return UnallocatedEncoding();
+ }
+
+ const IR::U32U64 operand1 = V_scalar(*datasize, Vn);
+ const IR::U32U64 operand2 = V_scalar(*datasize, Vm);
+
+ const IR::U32U64 result = ir.FPNeg(ir.FPMul(operand1, operand2));
+
+ V_scalar(*datasize, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp
new file mode 100644
index 0000000000..d0fd93867c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp
@@ -0,0 +1,409 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+#include <mcl/bit/bit_count.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bit/rotate.hpp>
+
+#include "dynarmic/ir/terminal.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::InterpretThisInstruction() {
+ ir.SetTerm(IR::Term::Interpret(*ir.current_location));
+ return false;
+}
+
+bool TranslatorVisitor::UnpredictableInstruction() {
+ return RaiseException(Exception::UnpredictableInstruction);
+}
+
+bool TranslatorVisitor::DecodeError() {
+ UNREACHABLE();
+}
+
+bool TranslatorVisitor::ReservedValue() {
+ return RaiseException(Exception::ReservedValue);
+}
+
+bool TranslatorVisitor::UnallocatedEncoding() {
+ return RaiseException(Exception::UnallocatedEncoding);
+}
+
+bool TranslatorVisitor::RaiseException(Exception exception) {
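+    // Advance PC past the current instruction so execution can resume after the exception is handled.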
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.ExceptionRaised(exception);
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}});
+ return false;
+}
+
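+// Decodes the logical-immediate fields (N, imms, immr) into the (wmask, tmask) pair of the
+// ARMv8 DecodeBitMasks() pseudocode: wmask is the rotated, element-replicated bit pattern,
+// tmask the additional mask used by the bitfield instructions.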
+std::optional<TranslatorVisitor::BitMasks> TranslatorVisitor::DecodeBitMasks(bool immN, Imm<6> imms, Imm<6> immr, bool immediate) {
+ const int len = mcl::bit::highest_set_bit((immN ? 1 << 6 : 0) | (imms.ZeroExtend() ^ 0b111111));
+ if (len < 1) {
+ return std::nullopt;
+ }
+
+ const size_t levels = mcl::bit::ones<size_t>(len);
+ if (immediate && (imms.ZeroExtend() & levels) == levels) {
+ return std::nullopt;
+ }
+
+ const s32 S = s32(imms.ZeroExtend() & levels);
+ const s32 R = s32(immr.ZeroExtend() & levels);
+ const u64 d = u64(S - R) & levels;
+
+ const size_t esize = size_t{1} << len;
+ const u64 welem = mcl::bit::ones<u64>(S + 1);
+ const u64 telem = mcl::bit::ones<u64>(d + 1);
+ const u64 wmask = mcl::bit::rotate_right(mcl::bit::replicate_element<u64>(esize, welem), R);
+ const u64 tmask = mcl::bit::replicate_element<u64>(esize, telem);
+
+ return BitMasks{wmask, tmask};
+}
+
+IR::UAny TranslatorVisitor::I(size_t bitsize, u64 value) {
+ switch (bitsize) {
+ case 8:
+ return ir.Imm8(static_cast<u8>(value));
+ case 16:
+ return ir.Imm16(static_cast<u16>(value));
+ case 32:
+ return ir.Imm32(static_cast<u32>(value));
+ case 64:
+ return ir.Imm64(value);
+ default:
+ ASSERT_FALSE("Imm - get: Invalid bitsize");
+ }
+}
+
+IR::UAny TranslatorVisitor::X(size_t bitsize, Reg reg) {
+ switch (bitsize) {
+ case 8:
+ return ir.LeastSignificantByte(ir.GetW(reg));
+ case 16:
+ return ir.LeastSignificantHalf(ir.GetW(reg));
+ case 32:
+ return ir.GetW(reg);
+ case 64:
+ return ir.GetX(reg);
+ default:
+ ASSERT_FALSE("X - get: Invalid bitsize");
+ }
+}
+
+void TranslatorVisitor::X(size_t bitsize, Reg reg, IR::U32U64 value) {
+ switch (bitsize) {
+ case 32:
+ ir.SetW(reg, value);
+ return;
+ case 64:
+ ir.SetX(reg, value);
+ return;
+ default:
+ ASSERT_FALSE("X - set: Invalid bitsize");
+ }
+}
+
+IR::U32U64 TranslatorVisitor::SP(size_t bitsize) {
+ switch (bitsize) {
+ case 32:
+ return ir.LeastSignificantWord(ir.GetSP());
+ case 64:
+ return ir.GetSP();
+ default:
+ ASSERT_FALSE("SP - get : Invalid bitsize");
+ }
+}
+
+void TranslatorVisitor::SP(size_t bitsize, IR::U32U64 value) {
+ switch (bitsize) {
+ case 32:
+ ir.SetSP(ir.ZeroExtendWordToLong(value));
+ break;
+ case 64:
+ ir.SetSP(value);
+ break;
+ default:
+ ASSERT_FALSE("SP - set : Invalid bitsize");
+ }
+}
+
+IR::U128 TranslatorVisitor::V(size_t bitsize, Vec vec) {
+ switch (bitsize) {
+ case 32:
+ return ir.GetS(vec);
+ case 64:
+ return ir.GetD(vec);
+ case 128:
+ return ir.GetQ(vec);
+ default:
+ ASSERT_FALSE("V - get : Invalid bitsize");
+ }
+}
+
+void TranslatorVisitor::V(size_t bitsize, Vec vec, IR::U128 value) {
+ switch (bitsize) {
+ case 32:
+ ir.SetS(vec, value);
+ return;
+ case 64:
+ // TODO: Remove VectorZeroUpper when possible.
+ ir.SetD(vec, ir.VectorZeroUpper(value));
+ return;
+ case 128:
+ ir.SetQ(vec, value);
+ return;
+ default:
+ ASSERT_FALSE("V - Set : Invalid bitsize");
+ }
+}
+
+IR::UAnyU128 TranslatorVisitor::V_scalar(size_t bitsize, Vec vec) {
+ if (bitsize == 128) {
+ return V(128, vec);
+ }
+ // TODO: Optimize
+ return ir.VectorGetElement(bitsize, ir.GetQ(vec), 0);
+}
+
+void TranslatorVisitor::V_scalar(size_t bitsize, Vec vec, IR::UAnyU128 value) {
+ if (bitsize == 128) {
+ V(128, vec, value);
+ return;
+ }
+ // TODO: Optimize
+ ir.SetQ(vec, ir.ZeroExtendToQuad(value));
+}
+
+IR::U128 TranslatorVisitor::Vpart(size_t bitsize, Vec vec, size_t part) {
+ ASSERT(part == 0 || part == 1);
+ ASSERT(bitsize == 64);
+ if (part == 0) {
+ return V(64, vec);
+ }
+ return ir.ZeroExtendToQuad(ir.VectorGetElement(bitsize, V(128, vec), part));
+}
+
+void TranslatorVisitor::Vpart(size_t bitsize, Vec vec, size_t part, IR::U128 value) {
+ ASSERT(part == 0 || part == 1);
+ if (part == 0) {
+ ASSERT(bitsize == 64);
+ V(128, vec, ir.VectorZeroExtend(bitsize, value));
+ } else {
+ ASSERT(bitsize == 64);
+ V(128, vec, ir.VectorInterleaveLower(64, V(128, vec), value));
+ }
+}
+
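+// Scalar access to one element of a vector register: part 0 is the low element (any scalar size),
+// part 1 is the upper 64-bit element.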
+IR::UAny TranslatorVisitor::Vpart_scalar(size_t bitsize, Vec vec, size_t part) {
+ ASSERT(part == 0 || part == 1);
+ if (part == 0) {
+ ASSERT(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64);
+ } else {
+ ASSERT(bitsize == 64);
+ }
+ return ir.VectorGetElement(bitsize, V(128, vec), part);
+}
+
+void TranslatorVisitor::Vpart_scalar(size_t bitsize, Vec vec, size_t part, IR::UAny value) {
+ ASSERT(part == 0 || part == 1);
+ if (part == 0) {
+ ASSERT(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64);
+ V(128, vec, ir.ZeroExtendToQuad(value));
+ } else {
+ ASSERT(bitsize == 64);
+ V(128, vec, ir.VectorSetElement(64, V(128, vec), 1, value));
+ }
+}
+
+IR::UAnyU128 TranslatorVisitor::Mem(IR::U64 address, size_t bytesize, IR::AccType acc_type) {
+ switch (bytesize) {
+ case 1:
+ return ir.ReadMemory8(address, acc_type);
+ case 2:
+ return ir.ReadMemory16(address, acc_type);
+ case 4:
+ return ir.ReadMemory32(address, acc_type);
+ case 8:
+ return ir.ReadMemory64(address, acc_type);
+ case 16:
+ return ir.ReadMemory128(address, acc_type);
+ default:
+ ASSERT_FALSE("Invalid bytesize parameter {}", bytesize);
+ }
+}
+
+void TranslatorVisitor::Mem(IR::U64 address, size_t bytesize, IR::AccType acc_type, IR::UAnyU128 value) {
+ switch (bytesize) {
+ case 1:
+ ir.WriteMemory8(address, value, acc_type);
+ return;
+ case 2:
+ ir.WriteMemory16(address, value, acc_type);
+ return;
+ case 4:
+ ir.WriteMemory32(address, value, acc_type);
+ return;
+ case 8:
+ ir.WriteMemory64(address, value, acc_type);
+ return;
+ case 16:
+ ir.WriteMemory128(address, value, acc_type);
+ return;
+ default:
+ ASSERT_FALSE("Invalid bytesize parameter {}", bytesize);
+ }
+}
+
+IR::UAnyU128 TranslatorVisitor::ExclusiveMem(IR::U64 address, size_t bytesize, IR::AccType acc_type) {
+ switch (bytesize) {
+ case 1:
+ return ir.ExclusiveReadMemory8(address, acc_type);
+ case 2:
+ return ir.ExclusiveReadMemory16(address, acc_type);
+ case 4:
+ return ir.ExclusiveReadMemory32(address, acc_type);
+ case 8:
+ return ir.ExclusiveReadMemory64(address, acc_type);
+ case 16:
+ return ir.ExclusiveReadMemory128(address, acc_type);
+ default:
+ ASSERT_FALSE("Invalid bytesize parameter {}", bytesize);
+ }
+}
+
+IR::U32 TranslatorVisitor::ExclusiveMem(IR::U64 address, size_t bytesize, IR::AccType acc_type, IR::UAnyU128 value) {
+ switch (bytesize) {
+ case 1:
+ return ir.ExclusiveWriteMemory8(address, value, acc_type);
+ case 2:
+ return ir.ExclusiveWriteMemory16(address, value, acc_type);
+ case 4:
+ return ir.ExclusiveWriteMemory32(address, value, acc_type);
+ case 8:
+ return ir.ExclusiveWriteMemory64(address, value, acc_type);
+ case 16:
+ return ir.ExclusiveWriteMemory128(address, value, acc_type);
+ default:
+ ASSERT_FALSE("Invalid bytesize parameter {}", bytesize);
+ }
+}
+
+IR::U32U64 TranslatorVisitor::SignExtend(IR::UAny value, size_t to_size) {
+ switch (to_size) {
+ case 32:
+ return ir.SignExtendToWord(value);
+ case 64:
+ return ir.SignExtendToLong(value);
+ default:
+ ASSERT_FALSE("Invalid size parameter {}", to_size);
+ }
+}
+
+IR::U32U64 TranslatorVisitor::ZeroExtend(IR::UAny value, size_t to_size) {
+ switch (to_size) {
+ case 32:
+ return ir.ZeroExtendToWord(value);
+ case 64:
+ return ir.ZeroExtendToLong(value);
+ default:
+ ASSERT_FALSE("Invalid size parameter {}", to_size);
+ }
+}
+
+IR::U32U64 TranslatorVisitor::ShiftReg(size_t bitsize, Reg reg, Imm<2> shift, IR::U8 amount) {
+ IR::U32U64 result = X(bitsize, reg);
+ switch (shift.ZeroExtend()) {
+ case 0b00:
+ return ir.LogicalShiftLeft(result, amount);
+ case 0b01:
+ return ir.LogicalShiftRight(result, amount);
+ case 0b10:
+ return ir.ArithmeticShiftRight(result, amount);
+ case 0b11:
+ return ir.RotateRight(result, amount);
+ }
+ UNREACHABLE();
+}
+
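+// Implements the ExtendReg() pseudocode: extract the low 8/16/32/64 bits of the register,
+// sign- or zero-extend to the operation width, then shift left by 0 to 4 bits.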
+IR::U32U64 TranslatorVisitor::ExtendReg(size_t bitsize, Reg reg, Imm<3> option, u8 shift) {
+ ASSERT(shift <= 4);
+ ASSERT(bitsize == 32 || bitsize == 64);
+ IR::UAny val = X(bitsize, reg);
+ size_t len;
+ IR::U32U64 extended;
+ bool signed_extend;
+
+ switch (option.ZeroExtend()) {
+ case 0b000: { // UXTB
+ val = ir.LeastSignificantByte(val);
+ len = 8;
+ signed_extend = false;
+ break;
+ }
+ case 0b001: { // UXTH
+ val = ir.LeastSignificantHalf(val);
+ len = 16;
+ signed_extend = false;
+ break;
+ }
+ case 0b010: { // UXTW
+ if (bitsize != 32) {
+ val = ir.LeastSignificantWord(val);
+ }
+ len = 32;
+ signed_extend = false;
+ break;
+ }
+ case 0b011: { // UXTX
+ len = 64;
+ signed_extend = false;
+ break;
+ }
+ case 0b100: { // SXTB
+ val = ir.LeastSignificantByte(val);
+ len = 8;
+ signed_extend = true;
+ break;
+ }
+ case 0b101: { // SXTH
+ val = ir.LeastSignificantHalf(val);
+ len = 16;
+ signed_extend = true;
+ break;
+ }
+ case 0b110: { // SXTW
+ if (bitsize != 32) {
+ val = ir.LeastSignificantWord(val);
+ }
+ len = 32;
+ signed_extend = true;
+ break;
+ }
+ case 0b111: { // SXTX
+ len = 64;
+ signed_extend = true;
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ if (len < bitsize) {
+ if (bitsize == 32) {
+ extended = signed_extend ? ir.SignExtendToWord(val) : ir.ZeroExtendToWord(val);
+ } else {
+ extended = signed_extend ? ir.SignExtendToLong(val) : ir.ZeroExtendToLong(val);
+ }
+ } else {
+ extended = val;
+ }
+
+ return ir.LogicalShiftLeft(extended, ir.Imm8(shift));
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h
new file mode 100644
index 0000000000..86bc534d5e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h
@@ -0,0 +1,1084 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+
+#include "dynarmic/frontend/A64/a64_ir_emitter.h"
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/a64_types.h"
+#include "dynarmic/frontend/A64/translate/a64_translate.h"
+#include "dynarmic/frontend/imm.h"
+
+namespace Dynarmic::A64 {
+
+struct TranslatorVisitor final {
+ using instruction_return_type = bool;
+
+ explicit TranslatorVisitor(IR::Block& block, LocationDescriptor descriptor, TranslationOptions options)
+ : ir(block, descriptor), options(std::move(options)) {}
+
+ A64::IREmitter ir;
+ TranslationOptions options;
+
+ bool InterpretThisInstruction();
+ bool UnpredictableInstruction();
+ bool DecodeError();
+ bool ReservedValue();
+ bool UnallocatedEncoding();
+ bool RaiseException(Exception exception);
+
+ struct BitMasks {
+ u64 wmask, tmask;
+ };
+
+ static std::optional<BitMasks> DecodeBitMasks(bool immN, Imm<6> imms, Imm<6> immr, bool immediate);
+
+ IR::UAny I(size_t bitsize, u64 value);
+ IR::UAny X(size_t bitsize, Reg reg);
+ void X(size_t bitsize, Reg reg, IR::U32U64 value);
+ IR::U32U64 SP(size_t bitsize);
+ void SP(size_t bitsize, IR::U32U64 value);
+
+ IR::U128 V(size_t bitsize, Vec vec);
+ void V(size_t bitsize, Vec vec, IR::U128 value);
+
+ IR::UAnyU128 V_scalar(size_t bitsize, Vec vec);
+ void V_scalar(size_t bitsize, Vec vec, IR::UAnyU128 value);
+
+ IR::U128 Vpart(size_t bitsize, Vec vec, size_t part);
+ void Vpart(size_t bitsize, Vec vec, size_t part, IR::U128 value);
+
+ IR::UAny Vpart_scalar(size_t bitsize, Vec vec, size_t part);
+ void Vpart_scalar(size_t bitsize, Vec vec, size_t part, IR::UAny value);
+
+ IR::UAnyU128 Mem(IR::U64 address, size_t size, IR::AccType acctype);
+ void Mem(IR::U64 address, size_t size, IR::AccType acctype, IR::UAnyU128 value);
+ IR::UAnyU128 ExclusiveMem(IR::U64 address, size_t size, IR::AccType acctype);
+ IR::U32 ExclusiveMem(IR::U64 address, size_t size, IR::AccType acctype, IR::UAnyU128 value);
+
+ IR::U32U64 SignExtend(IR::UAny value, size_t to_size);
+ IR::U32U64 ZeroExtend(IR::UAny value, size_t to_size);
+ IR::U32U64 ShiftReg(size_t bitsize, Reg reg, Imm<2> shift, IR::U8 amount);
+ IR::U32U64 ExtendReg(size_t bitsize, Reg reg, Imm<3> option, u8 shift);
+
+ // Data processing - Immediate - PC relative addressing
+ bool ADR(Imm<2> immlo, Imm<19> immhi, Reg Rd);
+ bool ADRP(Imm<2> immlo, Imm<19> immhi, Reg Rd);
+
+ // Data processing - Immediate - Add/Sub (with tag)
+ bool ADDG(Imm<6> offset_imm, Imm<4> tag_offset, Reg Rn, Reg Rd);
+ bool SUBG(Imm<6> offset_imm, Imm<4> tag_offset, Reg Rn, Reg Rd);
+
+ // Data processing - Immediate - Add/Sub
+ bool ADD_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd);
+ bool ADDS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd);
+ bool SUB_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd);
+ bool SUBS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd);
+
+ // Data processing - Immediate - Logical
+ bool AND_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool ORR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool EOR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool ANDS_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+
+ // Data processing - Immediate - Move Wide
+ bool MOVN(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd);
+ bool MOVZ(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd);
+ bool MOVK(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd);
+
+ // Data processing - Immediate - Bitfield
+ bool SBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool BFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool UBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd);
+ bool ASR_1(Imm<5> immr, Reg Rn, Reg Rd);
+ bool ASR_2(Imm<6> immr, Reg Rn, Reg Rd);
+ bool SXTB_1(Reg Rn, Reg Rd);
+ bool SXTB_2(Reg Rn, Reg Rd);
+ bool SXTH_1(Reg Rn, Reg Rd);
+ bool SXTH_2(Reg Rn, Reg Rd);
+ bool SXTW(Reg Rn, Reg Rd);
+
+ // Data processing - Immediate - Extract
+ bool EXTR(bool sf, bool N, Reg Rm, Imm<6> imms, Reg Rn, Reg Rd);
+
+ // Conditional branch
+ bool B_cond(Imm<19> imm19, Cond cond);
+
+ // Exception generation
+ bool SVC(Imm<16> imm16);
+ bool HVC(Imm<16> imm16);
+ bool SMC(Imm<16> imm16);
+ bool BRK(Imm<16> imm16);
+ bool HLT(Imm<16> imm16);
+ bool DCPS1(Imm<16> imm16);
+ bool DCPS2(Imm<16> imm16);
+ bool DCPS3(Imm<16> imm16);
+
+ // System
+ bool MSR_imm(Imm<3> op1, Imm<4> CRm, Imm<3> op2);
+ bool HINT(Imm<4> CRm, Imm<3> op2);
+ bool NOP();
+ bool YIELD();
+ bool WFE();
+ bool WFI();
+ bool SEV();
+ bool SEVL();
+ bool XPAC_1(bool D, Reg Rd);
+ bool XPAC_2();
+ bool PACIA_1(bool Z, Reg Rn, Reg Rd);
+ bool PACIA_2();
+ bool PACIB_1(bool Z, Reg Rn, Reg Rd);
+ bool PACIB_2();
+ bool AUTIA_1(bool Z, Reg Rn, Reg Rd);
+ bool AUTIA_2();
+ bool AUTIB_1(bool Z, Reg Rn, Reg Rd);
+ bool AUTIB_2();
+ bool BTI(Imm<2> upper_op2);
+ bool ESB();
+ bool PSB();
+ bool TSB();
+ bool CSDB();
+ bool CLREX(Imm<4> CRm);
+ bool DSB(Imm<4> CRm);
+ bool SSBB();
+ bool PSSBB();
+ bool DMB(Imm<4> CRm);
+ bool ISB(Imm<4> CRm);
+ bool SYS(Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt);
+ bool SB();
+ bool MSR_reg(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt);
+ bool SYSL(Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt);
+ bool MRS(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt);
+
+ // System - Flag manipulation instructions
+ bool CFINV();
+ bool RMIF(Imm<6> lsb, Reg Rn, Imm<4> mask);
+ bool SETF8(Reg Rn);
+ bool SETF16(Reg Rn);
+
+ // System - Flag format instructions
+ bool XAFlag();
+ bool AXFlag();
+
+ // SYS: Data Cache
+ bool DC_IVAC(Reg Rt);
+ bool DC_ISW(Reg Rt);
+ bool DC_CSW(Reg Rt);
+ bool DC_CISW(Reg Rt);
+ bool DC_ZVA(Reg Rt);
+ bool DC_CVAC(Reg Rt);
+ bool DC_CVAU(Reg Rt);
+ bool DC_CVAP(Reg Rt);
+ bool DC_CIVAC(Reg Rt);
+
+ // SYS: Instruction Cache
+ bool IC_IALLU();
+ bool IC_IALLUIS();
+ bool IC_IVAU(Reg Rt);
+
+ // Unconditional branch (Register)
+ bool BR(Reg Rn);
+ bool BRA(bool Z, bool M, Reg Rn, Reg Rm);
+ bool BLR(Reg Rn);
+ bool BLRA(bool Z, bool M, Reg Rn, Reg Rm);
+ bool RET(Reg Rn);
+ bool RETA(bool M);
+ bool ERET();
+ bool ERETA(bool M);
+ bool DRPS();
+
+ // Unconditional branch (immediate)
+ bool B_uncond(Imm<26> imm26);
+ bool BL(Imm<26> imm26);
+
+ // Compare and branch (immediate)
+ bool CBZ(bool sf, Imm<19> imm19, Reg Rt);
+ bool CBNZ(bool sf, Imm<19> imm19, Reg Rt);
+ bool TBZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt);
+ bool TBNZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt);
+
+ // Loads and stores - Advanced SIMD Load/Store multiple structures
+ bool STx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt);
+ bool STx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt);
+ bool LDx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt);
+ bool LDx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt);
+
+ // Loads and stores - Advanced SIMD Load/Store single structures
+ bool ST1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool ST4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD1R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD1R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD3R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD3R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD2R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD2R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD4R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt);
+ bool LD4R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt);
+
+ // Loads and stores - Load/Store Exclusive
+ bool STXR(Imm<2> size, Reg Rs, Reg Rn, Reg Rt);
+ bool STLXR(Imm<2> size, Reg Rs, Reg Rn, Reg Rt);
+ bool STXP(Imm<1> size, Reg Rs, Reg Rt2, Reg Rn, Reg Rt);
+ bool STLXP(Imm<1> size, Reg Rs, Reg Rt2, Reg Rn, Reg Rt);
+ bool LDXR(Imm<2> size, Reg Rn, Reg Rt);
+ bool LDAXR(Imm<2> size, Reg Rn, Reg Rt);
+ bool LDXP(Imm<1> size, Reg Rt2, Reg Rn, Reg Rt);
+ bool LDAXP(Imm<1> size, Reg Rt2, Reg Rn, Reg Rt);
+ bool STLLR(Imm<2> size, Reg Rn, Reg Rt);
+ bool STLR(Imm<2> size, Reg Rn, Reg Rt);
+ bool LDLAR(Imm<2> size, Reg Rn, Reg Rt);
+ bool LDAR(Imm<2> size, Reg Rn, Reg Rt);
+ bool CASP(bool sz, bool L, Reg Rs, bool o0, Reg Rn, Reg Rt);
+ bool CASB(bool L, Reg Rs, bool o0, Reg Rn, Reg Rt);
+ bool CASH(bool L, Reg Rs, bool o0, Reg Rn, Reg Rt);
+ bool CAS(bool sz, bool L, Reg Rs, bool o0, Reg Rn, Reg Rt);
+
+ // Loads and stores - Load register (literal)
+ bool LDR_lit_gen(bool opc_0, Imm<19> imm19, Reg Rt);
+ bool LDR_lit_fpsimd(Imm<2> opc, Imm<19> imm19, Vec Vt);
+ bool LDRSW_lit(Imm<19> imm19, Reg Rt);
+ bool PRFM_lit(Imm<19> imm19, Imm<5> prfop);
+
+ // Loads and stores - Load/Store no-allocate pair
+ bool STNP_LDNP_gen(Imm<1> upper_opc, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt);
+ bool STNP_LDNP_fpsimd(Imm<2> opc, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt);
+
+ // Loads and stores - Load/Store register pair
+ bool STP_LDP_gen(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt);
+ bool STP_LDP_fpsimd(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt);
+ bool STGP_1(Imm<7> offset_imm, Reg Rt2, Reg Rn, Reg Rt);
+ bool STGP_2(Imm<7> offset_imm, Reg Rt2, Reg Rn, Reg Rt);
+ bool STGP_3(Imm<7> offset_imm, Reg Rt2, Reg Rn, Reg Rt);
+
+ // Loads and stores - Load/Store register (immediate)
+ bool STRx_LDRx_imm_1(Imm<2> size, Imm<2> opc, Imm<9> imm9, bool not_postindex, Reg Rn, Reg Rt);
+ bool STRx_LDRx_imm_2(Imm<2> size, Imm<2> opc, Imm<12> imm12, Reg Rn, Reg Rt);
+ bool STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt);
+ bool PRFM_imm(Imm<12> imm12, Reg Rn, Reg Rt);
+ bool PRFM_unscaled_imm(Imm<9> imm9, Reg Rn, Reg Rt);
+ bool STR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt);
+ bool STR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt);
+ bool LDR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt);
+ bool LDR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt);
+ bool STUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt);
+ bool LDUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt);
+
+ // Loads and stores - Load/Store register (unprivileged)
+ bool STTRB(Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTRB(Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTRSB(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt);
+ bool STTRH(Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTRH(Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTRSH(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt);
+ bool STTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt);
+ bool LDTRSW(Imm<9> imm9, Reg Rn, Reg Rt);
+
+ // Loads and stores - Atomic memory options
+ bool LDADDB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDCLRB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDEORB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSETB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMAXB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMINB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMAXB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMINB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool SWPB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDAPRB(Reg Rn, Reg Rt);
+ bool LDADDH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDCLRH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDEORH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSETH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMAXH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMINH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMAXH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMINH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool SWPH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDAPRH(Reg Rn, Reg Rt);
+ bool LDADD(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDCLR(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDEOR(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSET(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMAX(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDSMIN(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMAX(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDUMIN(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool SWP(bool A, bool R, Reg Rs, Reg Rn, Reg Rt);
+ bool LDAPR(Reg Rn, Reg Rt);
+
+ // Loads and stores - Load/Store register (register offset)
+ bool STRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt);
+ bool LDRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt);
+ bool STR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt);
+ bool LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt);
+
+ // Loads and stores - Load/Store memory tags
+ bool STG_1(Imm<9> imm9, Reg Rn);
+ bool STG_2(Imm<9> imm9, Reg Rn);
+ bool STG_3(Imm<9> imm9, Reg Rn);
+ bool LDG(Imm<9> offset_imm, Reg Rn, Reg Rt);
+ bool STZG_1(Imm<9> offset_imm, Reg Rn);
+ bool STZG_2(Imm<9> offset_imm, Reg Rn);
+ bool STZG_3(Imm<9> offset_imm, Reg Rn);
+ bool ST2G_1(Imm<9> offset_imm, Reg Rn);
+ bool ST2G_2(Imm<9> offset_imm, Reg Rn);
+ bool ST2G_3(Imm<9> offset_imm, Reg Rn);
+ bool STGV(Reg Rn, Reg Rt);
+ bool STZ2G_1(Imm<9> offset_imm, Reg Rn);
+ bool STZ2G_2(Imm<9> offset_imm, Reg Rn);
+ bool STZ2G_3(Imm<9> offset_imm, Reg Rn);
+ bool LDGV(Reg Rn, Reg Rt);
+
+ // Loads and stores - Load/Store register (pointer authentication)
+ bool LDRA(bool M, bool S, Imm<9> imm9, bool W, Reg Rn, Reg Rt);
+
+ // Data Processing - Register - 2 source
+ bool UDIV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool SDIV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool LSLV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool LSRV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool ASRV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool RORV(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool CRC32(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd);
+ bool CRC32C(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd);
+ bool PACGA(Reg Rm, Reg Rn, Reg Rd);
+ bool SUBP(Reg Rm, Reg Rn, Reg Rd);
+ bool IRG(Reg Rm, Reg Rn, Reg Rd);
+ bool GMI(Reg Rm, Reg Rn, Reg Rd);
+ bool SUBPS(Reg Rm, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - 1 source
+ bool RBIT_int(bool sf, Reg Rn, Reg Rd);
+ bool REV16_int(bool sf, Reg Rn, Reg Rd);
+ bool REV(bool sf, bool opc_0, Reg Rn, Reg Rd);
+ bool CLZ_int(bool sf, Reg Rn, Reg Rd);
+ bool CLS_int(bool sf, Reg Rn, Reg Rd);
+ bool REV32_int(Reg Rn, Reg Rd);
+ bool PACDA(bool Z, Reg Rn, Reg Rd);
+ bool PACDB(bool Z, Reg Rn, Reg Rd);
+ bool AUTDA(bool Z, Reg Rn, Reg Rd);
+ bool AUTDB(bool Z, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - Logical (shifted register)
+ bool AND_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool BIC_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool ORR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool ORN_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool EOR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool EON(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool ANDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool BICS(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - Add/Sub (shifted register)
+ bool ADD_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool ADDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool SUB_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+ bool SUBS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - Add/Sub (shifted register)
+ bool ADD_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd);
+ bool ADDS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd);
+ bool SUB_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd);
+ bool SUBS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - Add/Sub (with carry)
+ bool ADC(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool ADCS(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool SBC(bool sf, Reg Rm, Reg Rn, Reg Rd);
+ bool SBCS(bool sf, Reg Rm, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - Conditional compare
+ bool CCMN_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv);
+ bool CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv);
+ bool CCMN_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv);
+ bool CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv);
+
+ // Data Processing - Register - Conditional select
+ bool CSEL(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd);
+ bool CSINC(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd);
+ bool CSINV(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd);
+ bool CSNEG(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd);
+
+ // Data Processing - Register - 3 source
+ bool MADD(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool MSUB(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool SMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool SMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool SMULH(Reg Rm, Reg Rn, Reg Rd);
+ bool UMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool UMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd);
+ bool UMULH(Reg Rm, Reg Rn, Reg Rd);
+
+ // Data Processing - FP and SIMD - AES
+ bool AESE(Vec Vn, Vec Vd);
+ bool AESD(Vec Vn, Vec Vd);
+ bool AESMC(Vec Vn, Vec Vd);
+ bool AESIMC(Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SHA
+ bool SHA1C(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA1P(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA1M(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA1SU0(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA256H(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA256H2(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA256SU1(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA1H(Vec Vn, Vec Vd);
+ bool SHA1SU1(Vec Vn, Vec Vd);
+ bool SHA256SU0(Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar copy
+ bool DUP_elt_1(Imm<5> imm5, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar three
+ bool FMULX_vec_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FMULX_vec_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMEQ_reg_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FCMEQ_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FRECPS_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FRECPS_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FRSQRTS_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FRSQRTS_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGE_reg_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGE_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGE_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FACGE_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FABD_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FABD_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGT_reg_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGT_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGT_1(Vec Vm, Vec Vn, Vec Vd);
+ bool FACGT_2(bool sz, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar two register misc FP16
+ bool FCVTNS_1(Vec Vn, Vec Vd);
+ bool FCVTMS_1(Vec Vn, Vec Vd);
+ bool FCVTAS_1(Vec Vn, Vec Vd);
+ bool SCVTF_int_1(Vec Vn, Vec Vd);
+ bool FCMGT_zero_1(Vec Vn, Vec Vd);
+ bool FCMEQ_zero_1(Vec Vn, Vec Vd);
+ bool FCMLT_1(Vec Vn, Vec Vd);
+ bool FCVTPS_1(Vec Vn, Vec Vd);
+ bool FCVTZS_int_1(Vec Vn, Vec Vd);
+ bool FRECPE_1(Vec Vn, Vec Vd);
+ bool FRECPX_1(Vec Vn, Vec Vd);
+ bool FCVTNU_1(Vec Vn, Vec Vd);
+ bool FCVTMU_1(Vec Vn, Vec Vd);
+ bool FCVTAU_1(Vec Vn, Vec Vd);
+ bool UCVTF_int_1(Vec Vn, Vec Vd);
+ bool FCMGE_zero_1(Vec Vn, Vec Vd);
+ bool FCMLE_1(Vec Vn, Vec Vd);
+ bool FCVTPU_1(Vec Vn, Vec Vd);
+ bool FCVTZU_int_1(Vec Vn, Vec Vd);
+ bool FRSQRTE_1(Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar two register misc
+ bool FCVTNS_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTMS_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTAS_2(bool sz, Vec Vn, Vec Vd);
+ bool SCVTF_int_2(bool sz, Vec Vn, Vec Vd);
+ bool FCMGT_zero_2(bool sz, Vec Vn, Vec Vd);
+ bool FCMEQ_zero_2(bool sz, Vec Vn, Vec Vd);
+ bool FCMLT_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTPS_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTZS_int_2(bool sz, Vec Vn, Vec Vd);
+ bool FRECPE_2(bool sz, Vec Vn, Vec Vd);
+ bool FRECPX_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTNU_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTMU_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTAU_2(bool sz, Vec Vn, Vec Vd);
+ bool UCVTF_int_2(bool sz, Vec Vn, Vec Vd);
+ bool FCMGE_zero_2(bool sz, Vec Vn, Vec Vd);
+ bool FCMLE_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTPU_2(bool sz, Vec Vn, Vec Vd);
+ bool FCVTZU_int_2(bool sz, Vec Vn, Vec Vd);
+ bool FRSQRTE_2(bool sz, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Two register misc FP16
+ bool FCVTNS_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTMS_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTAS_3(bool Q, Vec Vn, Vec Vd);
+ bool SCVTF_int_3(bool Q, Vec Vn, Vec Vd);
+ bool FCMGT_zero_3(bool Q, Vec Vn, Vec Vd);
+ bool FCMEQ_zero_3(bool Q, Vec Vn, Vec Vd);
+ bool FCMLT_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTPS_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTZS_int_3(bool Q, Vec Vn, Vec Vd);
+ bool FRECPE_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTNU_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTMU_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTAU_3(bool Q, Vec Vn, Vec Vd);
+ bool UCVTF_int_3(bool Q, Vec Vn, Vec Vd);
+ bool FCMGE_zero_3(bool Q, Vec Vn, Vec Vd);
+ bool FCMLE_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTPU_3(bool Q, Vec Vn, Vec Vd);
+ bool FCVTZU_int_3(bool Q, Vec Vn, Vec Vd);
+ bool FRSQRTE_3(bool Q, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Two register misc
+ bool FCVTNS_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTMS_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTAS_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool SCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCMGT_zero_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCMEQ_zero_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCMLT_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTPS_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTZS_int_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRECPE_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTNU_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTMU_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTAU_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool UCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCMGE_zero_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCMLE_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTPU_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTZU_int_4(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRSQRTE_4(bool Q, bool sz, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar three same extra
+ bool SQRDMLAH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRDMLAH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRDMLSH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRDMLSH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Scalar two-register misc
+ bool SUQADD_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool SQABS_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool CMLT_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool ABS_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool SQXTN_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool USQADD_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool SQNEG_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool CMLE_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool NEG_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool SQXTUN_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool UQXTN_1(Imm<2> size, Vec Vn, Vec Vd);
+ bool FCVTXN_1(bool sz, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Scalar pairwise
+ bool ADDP_pair(Imm<2> size, Vec Vn, Vec Vd);
+ bool FMAXNMP_pair_1(Vec Vn, Vec Vd);
+ bool FMAXNMP_pair_2(bool sz, Vec Vn, Vec Vd);
+ bool FADDP_pair_1(Vec Vn, Vec Vd);
+ bool FADDP_pair_2(bool sz, Vec Vn, Vec Vd);
+ bool FMAXP_pair_1(Vec Vn, Vec Vd);
+ bool FMAXP_pair_2(bool sz, Vec Vn, Vec Vd);
+ bool FMINNMP_pair_1(Vec Vn, Vec Vd);
+ bool FMINNMP_pair_2(bool sz, Vec Vn, Vec Vd);
+ bool FMINP_pair_1(Vec Vn, Vec Vd);
+ bool FMINP_pair_2(bool sz, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Scalar three different
+ bool SQDMLAL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd);
+ bool SQDMLSL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd);
+ bool SQDMULL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Scalar three same
+ bool SQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMGT_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMGE_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMTST_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMHI_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMHS_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool USHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool URSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMEQ_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Scalar shift by immediate
+ bool SSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SHL_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQRSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool FCVTZS_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool USHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool USRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool URSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool URSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SLI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHLU_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQRSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQRSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool FCVTZU_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Scalar x indexed element
+ bool SQDMLAL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMLSL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMULL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLA_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLS_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMUL_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMUL_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMLAH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMLSH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMULX_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMULX_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Table Lookup
+ bool TBL(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd);
+ bool TBX(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Permute
+ bool UZP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool TRN1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ZIP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UZP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool TRN2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ZIP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Extract
+ bool EXT(bool Q, Vec Vm, Imm<4> imm4, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Copy
+ bool DUP_elt_2(bool Q, Imm<5> imm5, Vec Vn, Vec Vd);
+ bool DUP_gen(bool Q, Imm<5> imm5, Reg Rn, Vec Vd);
+ bool SMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd);
+ bool UMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd);
+ bool INS_gen(Imm<5> imm5, Reg Rn, Vec Vd);
+ bool INS_elt(Imm<5> imm5, Imm<4> imm4, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Three same
+ bool FMULX_vec_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMEQ_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FRECPS_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FRSQRTS_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGE_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGE_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FABD_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGT_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGT_3(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXNM_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLA_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FADD_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAX_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINNM_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLS_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FSUB_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMIN_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FADDP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMUL_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FDIV_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Three same extra
+ bool SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMLA_vec(bool Q, Imm<2> size, Vec Vm, Imm<2> rot, Vec Vn, Vec Vd);
+ bool FCADD_vec(bool Q, Imm<2> size, Vec Vm, Imm<1> rot, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Two register misc
+ bool REV64_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool REV16_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CLS_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CNT(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool XTN(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool FCVTN(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FCVTL(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool URECPE(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool REV32_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool UADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CLZ_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool UADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SHLL(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool NOT(bool Q, Vec Vn, Vec Vd);
+ bool RBIT_asimd(bool Q, Vec Vn, Vec Vd);
+ bool URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool FCVTXN_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTN_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTM_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FABS_1(bool Q, Vec Vn, Vec Vd);
+ bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTP_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTZ_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTA_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTX_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FNEG_1(bool Q, Vec Vn, Vec Vd);
+ bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINTI_1(bool Q, Vec Vn, Vec Vd);
+ bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FSQRT_1(bool Q, Vec Vn, Vec Vd);
+ bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINT32X_1(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINT64X_1(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINT32Z_1(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FRINT64Z_1(bool Q, bool sz, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD across lanes
+ bool SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool SMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool FMAXNMV_1(bool Q, Vec Vn, Vec Vd);
+ bool FMAXNMV_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FMAXV_1(bool Q, Vec Vn, Vec Vd);
+ bool FMAXV_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FMINNMV_1(bool Q, Vec Vn, Vec Vd);
+ bool FMINNMV_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool FMINV_1(bool Q, Vec Vn, Vec Vd);
+ bool FMINV_2(bool Q, bool sz, Vec Vn, Vec Vd);
+ bool UADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool UMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+ bool UMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD three different
+ bool SADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SSUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SSUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool PMULL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool USUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool USUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool RADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool RSUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQDMLAL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQDMLSL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQDMULL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD three same
+ bool SHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SRHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool MLA_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool MUL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ADDP_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLAL_vec_1(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLAL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool AND_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool BIC_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLSL_vec_1(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLSL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool ORR_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool ORN_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool UHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool URHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool MLS_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool PMUL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool EOR_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool BSL(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool BIT(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool BIF(bool Q, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FADD_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAX_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FSUB_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMIN_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FADDP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMUL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FDIV_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FMULX_vec_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMEQ_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FRECPS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FRSQRTS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGE_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGE_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FABD_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FCMGT_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool FACGT_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd);
+ bool SQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMGT_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMGE_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool ADD_vector(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMTST_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMHI_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMHS_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool USHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool URSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool UQRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool CMEQ_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+ bool SQRDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD modified immediate
+ bool MOVI(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<4> cmode, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd);
+ bool FMOV_2(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd);
+ bool FMOV_3(bool Q, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD Shift by immediate
+ bool SSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SHL_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool RSHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SSHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool FCVTZS_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool USHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool USRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool URSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool URSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SRI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SLI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHLU_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool SQRSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool USHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool UCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+ bool FCVTZU_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SIMD vector x indexed element
+ bool SMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMLAL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMLSL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMULL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLA_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLS_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMUL_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMUL_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLAL_elt_1(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLAL_elt_2(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLSL_elt_1(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMLSL_elt_2(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool MLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool UMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool MLS_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool UMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool UMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMLAH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool UDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool SQRDMLSH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMULX_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FMULX_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd);
+ bool FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Cryptographic three register
+ bool SM3TT1A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd);
+ bool SM3TT1B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd);
+ bool SM3TT2A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd);
+ bool SM3TT2B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SHA512 three register
+ bool SHA512H(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA512H2(Vec Vm, Vec Vn, Vec Vd);
+ bool SHA512SU1(Vec Vm, Vec Vn, Vec Vd);
+ bool RAX1(Vec Vm, Vec Vn, Vec Vd);
+ bool XAR(Vec Vm, Imm<6> imm6, Vec Vn, Vec Vd);
+ bool SM3PARTW1(Vec Vm, Vec Vn, Vec Vd);
+ bool SM3PARTW2(Vec Vm, Vec Vn, Vec Vd);
+ bool SM4EKEY(Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Cryptographic four register
+ bool EOR3(Vec Vm, Vec Va, Vec Vn, Vec Vd);
+ bool BCAX(Vec Vm, Vec Va, Vec Vn, Vec Vd);
+ bool SM3SS1(Vec Vm, Vec Va, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - SHA512 two register
+ bool SHA512SU0(Vec Vn, Vec Vd);
+ bool SM4E(Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Conversion between floating point and fixed point
+ bool SCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd);
+ bool UCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd);
+ bool FCVTZS_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd);
+ bool FCVTZU_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd);
+
+ // Data Processing - FP and SIMD - Conversion between floating point and integer
+ bool FCVTNS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTNU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool SCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd);
+ bool UCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd);
+ bool FCVTAS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTAU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FMOV_float_gen(bool sf, Imm<2> type, Imm<1> rmode_0, Imm<1> opc_0, size_t n, size_t d);
+ bool FCVTPS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTPU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTMS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTMU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTZS_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FCVTZU_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd);
+ bool FJCVTZS(Vec Vn, Reg Rd);
+
+ // Data Processing - FP and SIMD - Floating point data processing
+ bool FMOV_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FABS_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FNEG_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FSQRT_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FCVT_float(Imm<2> type, Imm<2> opc, Vec Vn, Vec Vd);
+ bool FRINTN_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTP_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTM_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTZ_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTA_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTX_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINTI_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINT32X_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINT64X_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINT32Z_float(Imm<2> type, Vec Vn, Vec Vd);
+ bool FRINT64Z_float(Imm<2> type, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Floating point compare
+ bool FCMP_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero);
+ bool FCMPE_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero);
+
+ // Data Processing - FP and SIMD - Floating point immediate
+ bool FMOV_float_imm(Imm<2> type, Imm<8> imm8, Vec Vd);
+
+ // Data Processing - FP and SIMD - Floating point conditional compare
+ bool FCCMP_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv);
+ bool FCCMPE_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv);
+
+ // Data Processing - FP and SIMD - Floating point data processing two register
+ bool FMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FDIV_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FADD_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FSUB_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAX_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FMIN_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FMAXNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FMINNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+ bool FNMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Floating point conditional select
+ bool FCSEL_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Vec Vd);
+
+ // Data Processing - FP and SIMD - Floating point data processing three register
+ bool FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd);
+ bool FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd);
+ bool FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd);
+ bool FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd);
+};
+
+inline std::optional<size_t> FPGetDataSize(Imm<2> type) {
+ switch (type.ZeroExtend()) {
+ case 0b00:
+ return 32;
+ case 0b01:
+ return 64;
+ case 0b11:
+ return 16;
+ }
+ return std::nullopt;
+}
+
+} // namespace Dynarmic::A64
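
FPGetDataSize is the header's only inline helper: it maps the two-bit type field shared by the floating-point declarations above to an operand width in bits, with 0b10 left unallocated. Below is a minimal standalone sketch of the same mapping, using a plain unsigned in place of dynarmic's Imm<2> wrapper; the name fp_data_size is an illustrative stand-in, not part of the library.

    #include <cstddef>
    #include <optional>

    // type 00 -> single (32 bits), 01 -> double (64), 11 -> half (16);
    // type 10 has no allocated encoding, so callers get an empty optional.
    constexpr std::optional<std::size_t> fp_data_size(unsigned type) {
        switch (type & 0b11u) {
        case 0b00: return 32;
        case 0b01: return 64;
        case 0b11: return 16;
        default:   return std::nullopt;
        }
    }

    static_assert(*fp_data_size(0b01) == 64);
    static_assert(!fp_data_size(0b10).has_value());

Callers in the translator treat the empty optional as an unallocated encoding.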
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp
new file mode 100644
index 0000000000..0e67e11b4b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp
@@ -0,0 +1,209 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& v, bool pair, size_t size, bool L, bool o0, std::optional<Reg> Rs, std::optional<Reg> Rt2, Reg Rn, Reg Rt) {
+ // Shared Decode
+
+ const auto acctype = o0 ? IR::AccType::ORDERED : IR::AccType::ATOMIC;
+ const auto memop = L ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ const size_t elsize = 8 << size;
+ const size_t regsize = elsize == 64 ? 64 : 32;
+ const size_t datasize = pair ? elsize * 2 : elsize;
+
+ // Operation
+
+ const size_t dbytes = datasize / 8;
+
+ if (memop == IR::MemOp::LOAD && pair && Rt == *Rt2) {
+ return v.UnpredictableInstruction();
+ } else if (memop == IR::MemOp::STORE && (*Rs == Rt || (pair && *Rs == *Rt2))) {
+ if (!v.options.define_unpredictable_behaviour) {
+ return v.UnpredictableInstruction();
+ }
+ // UNPREDICTABLE: The Constraint_NONE case is executed.
+ } else if (memop == IR::MemOp::STORE && *Rs == Rn && Rn != Reg::R31) {
+ return v.UnpredictableInstruction();
+ }
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ IR::UAnyU128 data;
+ if (pair && elsize == 64) {
+ data = v.ir.Pack2x64To1x128(v.X(64, Rt), v.X(64, *Rt2));
+ } else if (pair && elsize == 32) {
+ data = v.ir.Pack2x32To1x64(v.X(32, Rt), v.X(32, *Rt2));
+ } else {
+ data = v.X(elsize, Rt);
+ }
+ const IR::U32 status = v.ExclusiveMem(address, dbytes, acctype, data);
+ v.X(32, *Rs, status);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const IR::UAnyU128 data = v.ExclusiveMem(address, dbytes, acctype);
+ if (pair && elsize == 64) {
+ v.X(64, Rt, v.ir.VectorGetElement(64, data, 0));
+ v.X(64, *Rt2, v.ir.VectorGetElement(64, data, 1));
+ } else if (pair && elsize == 32) {
+ v.X(32, Rt, v.ir.LeastSignificantWord(data));
+ v.X(32, *Rt2, v.ir.MostSignificantWord(data).result);
+ } else {
+ v.X(regsize, Rt, v.ZeroExtend(data, regsize));
+ }
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) {
+ const bool pair = false;
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 0;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::STLXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) {
+ const bool pair = false;
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 1;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::STXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) {
+ const bool pair = true;
+ const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 0;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::STLXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) {
+ const bool pair = true;
+ const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 1;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDXR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const bool pair = false;
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 0;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDAXR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const bool pair = false;
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 1;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, {}, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) {
+ const bool pair = true;
+ const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 0;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDAXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) {
+ const bool pair = true;
+ const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 1;
+ return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, Rt2, Rn, Rt);
+}
+
+static bool OrderedSharedDecodeAndOperation(TranslatorVisitor& v, size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
+ // Shared Decode
+
+ const auto acctype = !o0 ? IR::AccType::LIMITEDORDERED : IR::AccType::ORDERED;
+ const auto memop = L ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ const size_t elsize = 8 << size;
+ const size_t regsize = elsize == 64 ? 64 : 32;
+ const size_t datasize = elsize;
+
+ // Operation
+
+ const size_t dbytes = datasize / 8;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ const IR::UAny data = v.X(datasize, Rt);
+ v.Mem(address, dbytes, acctype, data);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const IR::UAny data = v.Mem(address, dbytes, acctype);
+ v.X(regsize, Rt, v.ZeroExtend(data, regsize));
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STLLR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 0;
+ return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt);
+}
+
+bool TranslatorVisitor::STLR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 0;
+ const bool o0 = 1;
+ return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDLAR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 0;
+ return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDAR(Imm<2> sz, Reg Rn, Reg Rt) {
+ const size_t size = sz.ZeroExtend<size_t>();
+ const bool L = 1;
+ const bool o0 = 1;
+ return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt);
+}
+
+} // namespace Dynarmic::A64
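
The exclusive translations above all funnel through ExclusiveSharedDecodeAndOperation, whose width arithmetic is easy to lose among the UNPREDICTABLE checks: the element size is 8 << size, only a 64-bit element selects a 64-bit destination register, and the pair forms double the transfer size. The sketch below isolates just that arithmetic; the struct and function names are illustrative, not dynarmic API.

    #include <cstddef>

    struct ExclusiveWidths {
        std::size_t elsize;    // 8 << size
        std::size_t regsize;   // 64 only when the element itself is 64-bit
        std::size_t datasize;  // doubled for the pair forms (STXP/LDXP/STLXP/LDAXP)
    };

    constexpr ExclusiveWidths exclusive_widths(std::size_t size, bool pair) {
        const std::size_t elsize = std::size_t{8} << size;
        const std::size_t regsize = elsize == 64 ? 64 : 32;
        return {elsize, regsize, pair ? elsize * 2 : elsize};
    }

    static_assert(exclusive_widths(0b10, false).datasize == 32);   // STXR/LDXR on Wt
    static_assert(exclusive_widths(0b11, true).datasize == 128);   // STXP/LDXP on an Xt pair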
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp
new file mode 100644
index 0000000000..995aba4252
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp
@@ -0,0 +1,55 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::LDR_lit_gen(bool opc_0, Imm<19> imm19, Reg Rt) {
+ const size_t size = opc_0 == 0 ? 4 : 8;
+ const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+
+ const u64 address = ir.PC() + offset;
+ const auto data = Mem(ir.Imm64(address), size, IR::AccType::NORMAL);
+
+ X(8 * size, Rt, data);
+ return true;
+}
+
+bool TranslatorVisitor::LDR_lit_fpsimd(Imm<2> opc, Imm<19> imm19, Vec Vt) {
+ if (opc == 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ const u64 size = 4 << opc.ZeroExtend();
+ const u64 offset = imm19.SignExtend<u64>() << 2;
+ const IR::U64 address = ir.Imm64(ir.PC() + offset);
+ const IR::UAnyU128 data = Mem(address, size, IR::AccType::VEC);
+
+ if (size == 16) {
+ V(128, Vt, data);
+ } else {
+ V(128, Vt, ir.ZeroExtendToQuad(data));
+ }
+ return true;
+}
+
+bool TranslatorVisitor::LDRSW_lit(Imm<19> imm19, Reg Rt) {
+ const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+ const u64 address = ir.PC() + offset;
+ const auto data = Mem(ir.Imm64(address), 4, IR::AccType::NORMAL);
+
+ X(64, Rt, ir.SignExtendWordToLong(data));
+ return true;
+}
+
+bool TranslatorVisitor::PRFM_lit(Imm<19> /*imm19*/, Imm<5> /*prfop*/) {
+ // s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>();
+ // u64 address = ir.PC() + offset;
+ // Prefetch(address, prfop);
+ return true;
+}
+
+} // namespace Dynarmic::A64
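
All three literal loads compute the same PC-relative address: the 19-bit field gains two implicit zero bits and is sign-extended before being added to the PC. A plain-integer sketch of that offset calculation follows; the helper names are illustrative only.

    #include <cstdint>

    // Equivalent of concatenate(imm19, Imm<2>{0}).SignExtend<s64>() above.
    constexpr std::int64_t literal_offset(std::uint32_t imm19) {
        const std::int64_t field = static_cast<std::int64_t>(imm19 & 0x7FFFFu);          // 19-bit field
        const std::int64_t signed_field = field >= (1 << 18) ? field - (1 << 19) : field;
        return signed_field * 4;                                     // append the two implicit zero bits
    }

    constexpr std::uint64_t literal_address(std::uint64_t pc, std::uint32_t imm19) {
        return pc + static_cast<std::uint64_t>(literal_offset(imm19));
    }

    static_assert(literal_offset(1) == 4);
    static_assert(literal_offset(0x7FFFF) == -4);                    // all-ones field reaches backwards
    static_assert(literal_address(0x1000, 0x7FFFF) == 0x0FFC);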
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp
new file mode 100644
index 0000000000..77c57a9659
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp
@@ -0,0 +1,134 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp memop, bool Q, std::optional<Reg> Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t elements = datasize / esize;
+ const size_t ebytes = esize / 8;
+
+ size_t rpt, selem;
+ switch (opcode.ZeroExtend()) {
+ case 0b0000:
+ rpt = 1;
+ selem = 4;
+ break;
+ case 0b0010:
+ rpt = 4;
+ selem = 1;
+ break;
+ case 0b0100:
+ rpt = 1;
+ selem = 3;
+ break;
+ case 0b0110:
+ rpt = 3;
+ selem = 1;
+ break;
+ case 0b0111:
+ rpt = 1;
+ selem = 1;
+ break;
+ case 0b1000:
+ rpt = 1;
+ selem = 2;
+ break;
+ case 0b1010:
+ rpt = 2;
+ selem = 1;
+ break;
+ default:
+ return v.UnallocatedEncoding();
+ }
+ ASSERT(rpt == 1 || selem == 1);
+
+ if ((size == 0b11 && !Q) && selem != 1) {
+ return v.ReservedValue();
+ }
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+
+ IR::U64 offs = v.ir.Imm64(0);
+ if (selem == 1) {
+ for (size_t r = 0; r < rpt; r++) {
+ const Vec tt = static_cast<Vec>((VecNumber(Vt) + r) % 32);
+ if (memop == IR::MemOp::LOAD) {
+ const IR::UAnyU128 vec = v.Mem(v.ir.Add(address, offs), ebytes * elements, IR::AccType::VEC);
+ v.V_scalar(datasize, tt, vec);
+ } else {
+ const IR::UAnyU128 vec = v.V_scalar(datasize, tt);
+ v.Mem(v.ir.Add(address, offs), ebytes * elements, IR::AccType::VEC, vec);
+ }
+ offs = v.ir.Add(offs, v.ir.Imm64(ebytes * elements));
+ }
+ } else {
+ for (size_t e = 0; e < elements; e++) {
+ for (size_t s = 0; s < selem; s++) {
+ const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
+ if (memop == IR::MemOp::LOAD) {
+ const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
+ const IR::U128 vec = v.ir.VectorSetElement(esize, v.V(datasize, tt), e, elem);
+ v.V(datasize, tt, vec);
+ } else {
+ const IR::UAny elem = v.ir.VectorGetElement(esize, v.V(datasize, tt), e);
+ v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem);
+ }
+ offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
+ }
+ }
+ }
+
+ if (wback) {
+ if (*Rm != Reg::SP) {
+ offs = v.X(64, *Rm);
+ }
+
+ if (Rn == Reg::SP) {
+ v.SP(64, v.ir.Add(address, offs));
+ } else {
+ v.X(64, Rn, v.ir.Add(address, offs));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) {
+ const bool wback = false;
+ const auto memop = IR::MemOp::STORE;
+ return SharedDecodeAndOperation(*this, wback, memop, Q, {}, opcode, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::STx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) {
+ const bool wback = true;
+ const auto memop = IR::MemOp::STORE;
+ return SharedDecodeAndOperation(*this, wback, memop, Q, Rm, opcode, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) {
+ const bool wback = false;
+ const auto memop = IR::MemOp::LOAD;
+ return SharedDecodeAndOperation(*this, wback, memop, Q, {}, opcode, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) {
+ const bool wback = true;
+ const auto memop = IR::MemOp::LOAD;
+ return SharedDecodeAndOperation(*this, wback, memop, Q, Rm, opcode, size, Rn, Vt);
+}
+
+} // namespace Dynarmic::A64
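
The opcode switch in SharedDecodeAndOperation encodes two facts at once: how many consecutive registers are transferred whole (rpt) and how many structure elements are interleaved per access (selem), with exactly one of the two ever greater than one. The same table as a standalone lookup, with illustrative struct and function names:

    #include <cstddef>
    #include <optional>

    struct RptSelem {
        std::size_t rpt;    // consecutive registers copied whole
        std::size_t selem;  // interleaved structure elements (LD1..LD4 / ST1..ST4)
    };

    constexpr std::optional<RptSelem> multi_struct_rpt_selem(unsigned opcode) {
        switch (opcode & 0b1111u) {
        case 0b0000: return RptSelem{1, 4};  // LD4/ST4
        case 0b0010: return RptSelem{4, 1};  // LD1/ST1, four registers
        case 0b0100: return RptSelem{1, 3};  // LD3/ST3
        case 0b0110: return RptSelem{3, 1};  // LD1/ST1, three registers
        case 0b0111: return RptSelem{1, 1};  // LD1/ST1, one register
        case 0b1000: return RptSelem{1, 2};  // LD2/ST2
        case 0b1010: return RptSelem{2, 1};  // LD1/ST1, two registers
        default:     return std::nullopt;    // unallocated encoding
        }
    }

    static_assert(multi_struct_rpt_selem(0b0000)->selem == 4);
    static_assert(!multi_struct_rpt_selem(0b0001).has_value());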
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp
new file mode 100644
index 0000000000..c85cc55241
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp
@@ -0,0 +1,22 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+// Given the LDNP and STNP instructions are simply hinting at non-temporal
+// accesses, we can treat them as regular LDP and STP instructions for the
+// time being since we don't perform data caching.
+
+bool TranslatorVisitor::STNP_LDNP_gen(Imm<1> upper_opc, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt) {
+ return STP_LDP_gen(Imm<2>{upper_opc.ZeroExtend() << 1}, true, false, L, imm7, Rt2, Rn, Rt);
+}
+
+bool TranslatorVisitor::STNP_LDNP_fpsimd(Imm<2> opc, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt) {
+ return STP_LDP_fpsimd(opc, true, false, L, imm7, Vt2, Rn, Vt);
+}
+
+} // namespace Dynarmic::A64
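
Since the translation simply forwards to STP_LDP_gen, the only decode work here is rebuilding the two-bit opc with its low bit cleared, which selects the unsigned 32-bit (opc = 00) or 64-bit (opc = 10) pair forms. A plain-integer sketch of that reconstruction, with an illustrative helper name:

    // Equivalent of Imm<2>{upper_opc.ZeroExtend() << 1} in STNP_LDNP_gen above.
    constexpr unsigned rebuild_opc(unsigned upper_opc) {
        return (upper_opc & 1u) << 1;       // opc = upper_opc:'0'
    }

    static_assert(rebuild_opc(0) == 0b00);  // 32-bit STNP/LDNP -> 32-bit STP/LDP
    static_assert(rebuild_opc(1) == 0b10);  // 64-bit STNP/LDNP -> 64-bit STP/LDP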
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp
new file mode 100644
index 0000000000..d939c45f2c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp
@@ -0,0 +1,249 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool LoadStoreRegisterImmediate(TranslatorVisitor& v, bool wback, bool postindex, size_t scale, u64 offset, Imm<2> size, Imm<2> opc, Reg Rn, Reg Rt) {
+ IR::MemOp memop;
+ bool signed_ = false;
+ size_t regsize = 0;
+
+ if (opc.Bit<1>() == 0) {
+ memop = opc.Bit<0>() ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ regsize = size == 0b11 ? 64 : 32;
+ signed_ = false;
+ } else if (size == 0b11) {
+ memop = IR::MemOp::PREFETCH;
+ ASSERT(!opc.Bit<0>());
+ } else {
+ memop = IR::MemOp::LOAD;
+ ASSERT(!(size == 0b10 && opc.Bit<0>() == 1));
+ regsize = opc.Bit<0>() ? 32 : 64;
+ signed_ = true;
+ }
+
+ if (memop == IR::MemOp::LOAD && wback && Rn == Rt && Rn != Reg::R31) {
+ return v.UnpredictableInstruction();
+ }
+ if (memop == IR::MemOp::STORE && wback && Rn == Rt && Rn != Reg::R31) {
+ return v.UnpredictableInstruction();
+ }
+
+ // TODO: Check SP alignment
+ IR::U64 address = Rn == Reg::SP ? IR::U64(v.SP(64)) : IR::U64(v.X(64, Rn));
+ if (!postindex) {
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+ }
+
+ const size_t datasize = 8 << scale;
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ const auto data = v.X(datasize, Rt);
+ v.Mem(address, datasize / 8, IR::AccType::NORMAL, data);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const auto data = v.Mem(address, datasize / 8, IR::AccType::NORMAL);
+ if (signed_) {
+ v.X(regsize, Rt, v.SignExtend(data, regsize));
+ } else {
+ v.X(regsize, Rt, v.ZeroExtend(data, regsize));
+ }
+ break;
+ }
+ case IR::MemOp::PREFETCH:
+ // Prefetch(address, Rt)
+ break;
+ }
+
+ if (wback) {
+ if (postindex) {
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+ }
+
+ if (Rn == Reg::SP) {
+ v.SP(64, address);
+ } else {
+ v.X(64, Rn, address);
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STRx_LDRx_imm_1(Imm<2> size, Imm<2> opc, Imm<9> imm9, bool not_postindex, Reg Rn, Reg Rt) {
+ const bool wback = true;
+ const bool postindex = !not_postindex;
+ const size_t scale = size.ZeroExtend<size_t>();
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt);
+}
+
+bool TranslatorVisitor::STRx_LDRx_imm_2(Imm<2> size, Imm<2> opc, Imm<12> imm12, Reg Rn, Reg Rt) {
+ const bool wback = false;
+ const bool postindex = false;
+ const size_t scale = size.ZeroExtend<size_t>();
+ const u64 offset = imm12.ZeroExtend<u64>() << scale;
+
+ return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt);
+}
+
+bool TranslatorVisitor::STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) {
+ const bool wback = false;
+ const bool postindex = false;
+ const size_t scale = size.ZeroExtend<size_t>();
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt);
+}
+
+bool TranslatorVisitor::PRFM_imm([[maybe_unused]] Imm<12> imm12, [[maybe_unused]] Reg Rn, [[maybe_unused]] Reg Rt) {
+ // Currently a NOP (which is valid behavior, as indicated by
+ // the ARMv8 architecture reference manual)
+ return true;
+}
+
+bool TranslatorVisitor::PRFM_unscaled_imm([[maybe_unused]] Imm<9> imm9, [[maybe_unused]] Reg Rn, [[maybe_unused]] Reg Rt) {
+ // Currently a NOP (which is valid behavior, as indicated by
+ // the ARMv8 architecture reference manual)
+ return true;
+}
+
+static bool LoadStoreSIMD(TranslatorVisitor& v, bool wback, bool postindex, size_t scale, u64 offset, IR::MemOp memop, Reg Rn, Vec Vt) {
+ const auto acctype = IR::AccType::VEC;
+ const size_t datasize = 8 << scale;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+
+ if (!postindex) {
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+ }
+
+ switch (memop) {
+ case IR::MemOp::STORE:
+ if (datasize == 128) {
+ const IR::U128 data = v.V(128, Vt);
+ v.Mem(address, 16, acctype, data);
+ } else {
+ const IR::UAny data = v.ir.VectorGetElement(datasize, v.V(128, Vt), 0);
+ v.Mem(address, datasize / 8, acctype, data);
+ }
+ break;
+ case IR::MemOp::LOAD:
+ if (datasize == 128) {
+ const IR::U128 data = v.Mem(address, 16, acctype);
+ v.V(128, Vt, data);
+ } else {
+ const IR::UAny data = v.Mem(address, datasize / 8, acctype);
+ v.V(128, Vt, v.ir.ZeroExtendToQuad(data));
+ }
+ break;
+ default:
+ UNREACHABLE();
+ }
+
+ if (wback) {
+ if (postindex) {
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+ }
+
+ if (Rn == Reg::SP) {
+ v.SP(64, address);
+ } else {
+ v.X(64, Rn, address);
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = true;
+ const bool postindex = !not_postindex;
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt);
+}
+
+bool TranslatorVisitor::STR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = false;
+ const bool postindex = false;
+ const u64 offset = imm12.ZeroExtend<u64>() << scale;
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = true;
+ const bool postindex = !not_postindex;
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = false;
+ const bool postindex = false;
+ const u64 offset = imm12.ZeroExtend<u64>() << scale;
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt);
+}
+
+bool TranslatorVisitor::STUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = false;
+ const bool postindex = false;
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt) {
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+
+ const bool wback = false;
+ const bool postindex = false;
+ const u64 offset = imm9.SignExtend<u64>();
+
+ return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp
new file mode 100644
index 0000000000..4c81564f7c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp
@@ -0,0 +1,154 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::STP_LDP_gen(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt) {
+ if ((L == 0 && opc.Bit<0>() == 1) || opc == 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ const auto memop = L == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ if (memop == IR::MemOp::LOAD && wback && (Rt == Rn || Rt2 == Rn) && Rn != Reg::R31) {
+ return UnpredictableInstruction();
+ }
+ if (memop == IR::MemOp::STORE && wback && (Rt == Rn || Rt2 == Rn) && Rn != Reg::R31) {
+ return UnpredictableInstruction();
+ }
+ if (memop == IR::MemOp::LOAD && Rt == Rt2) {
+ return UnpredictableInstruction();
+ }
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = SP(64);
+ } else {
+ address = X(64, Rn);
+ }
+
+ const bool postindex = !not_postindex;
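+    // opc<0> set indicates LDPSW (32-bit loads sign-extended to 64-bit); opc<1> selects 32-bit (0) or 64-bit (1) registers.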
+ const bool signed_ = opc.Bit<0>() != 0;
+ const size_t scale = 2 + opc.Bit<1>();
+ const size_t datasize = 8 << scale;
+ const u64 offset = imm7.SignExtend<u64>() << scale;
+
+ if (!postindex) {
+ address = ir.Add(address, ir.Imm64(offset));
+ }
+
+ const size_t dbytes = datasize / 8;
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ const IR::U32U64 data1 = X(datasize, Rt);
+ const IR::U32U64 data2 = X(datasize, Rt2);
+ Mem(address, dbytes, IR::AccType::NORMAL, data1);
+ Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::NORMAL, data2);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const IR::U32U64 data1 = Mem(address, dbytes, IR::AccType::NORMAL);
+ const IR::U32U64 data2 = Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::NORMAL);
+ if (signed_) {
+ X(64, Rt, SignExtend(data1, 64));
+ X(64, Rt2, SignExtend(data2, 64));
+ } else {
+ X(datasize, Rt, data1);
+ X(datasize, Rt2, data2);
+ }
+ break;
+ }
+ case IR::MemOp::PREFETCH:
+ UNREACHABLE();
+ }
+
+ if (wback) {
+ if (postindex) {
+ address = ir.Add(address, ir.Imm64(offset));
+ }
+
+ if (Rn == Reg::SP) {
+ SP(64, address);
+ } else {
+ X(64, Rn, address);
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STP_LDP_fpsimd(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt) {
+ if (opc == 0b11) {
+ return UnallocatedEncoding();
+ }
+
+ const auto memop = L == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ if (memop == IR::MemOp::LOAD && Vt == Vt2) {
+ return UnpredictableInstruction();
+ }
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = SP(64);
+ } else {
+ address = X(64, Rn);
+ }
+
+ const bool postindex = !not_postindex;
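+    // opc selects the transfer size: 00 = 32-bit, 01 = 64-bit, 10 = 128-bit (opc == 11 was rejected above).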
+ const size_t scale = 2 + opc.ZeroExtend<size_t>();
+ const size_t datasize = 8 << scale;
+ const u64 offset = imm7.SignExtend<u64>() << scale;
+ const size_t dbytes = datasize / 8;
+
+ if (!postindex) {
+ address = ir.Add(address, ir.Imm64(offset));
+ }
+
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ IR::UAnyU128 data1 = V(datasize, Vt);
+ IR::UAnyU128 data2 = V(datasize, Vt2);
+ if (datasize != 128) {
+ data1 = ir.VectorGetElement(datasize, data1, 0);
+ data2 = ir.VectorGetElement(datasize, data2, 0);
+ }
+ Mem(address, dbytes, IR::AccType::VEC, data1);
+ Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::VEC, data2);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ IR::UAnyU128 data1 = Mem(address, dbytes, IR::AccType::VEC);
+ IR::UAnyU128 data2 = Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::VEC);
+ if (datasize != 128) {
+ data1 = ir.ZeroExtendToQuad(data1);
+ data2 = ir.ZeroExtendToQuad(data2);
+ }
+ V(datasize, Vt, data1);
+ V(datasize, Vt2, data2);
+ break;
+ }
+ case IR::MemOp::PREFETCH:
+ UNREACHABLE();
+ }
+
+ if (wback) {
+ if (postindex) {
+ address = ir.Add(address, ir.Imm64(offset));
+ }
+
+ if (Rn == Reg::SP) {
+ SP(64, address);
+ } else {
+ X(64, Rn, address);
+ }
+ }
+
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp
new file mode 100644
index 0000000000..9791095868
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp
@@ -0,0 +1,160 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool RegSharedDecodeAndOperation(TranslatorVisitor& v, size_t scale, u8 shift, Imm<2> size, Imm<1> opc_1, Imm<1> opc_0, Reg Rm, Imm<3> option, Reg Rn, Reg Rt) {
+ // Shared Decode
+
+ const auto acctype = IR::AccType::NORMAL;
+ IR::MemOp memop;
+ size_t regsize = 64;
+ bool signed_ = false;
+
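+    // opc<1> == 0: plain store / zero-extending load; opc<1> == 1 with size == 11: prefetch; otherwise a sign-extending load.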
+ if (opc_1 == 0) {
+ memop = opc_0 == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ regsize = size == 0b11 ? 64 : 32;
+ signed_ = false;
+ } else if (size == 0b11) {
+ memop = IR::MemOp::PREFETCH;
+ if (opc_0 == 1) {
+ return v.UnallocatedEncoding();
+ }
+ } else {
+ memop = IR::MemOp::LOAD;
+ if (size == 0b10 && opc_0 == 1) {
+ return v.UnallocatedEncoding();
+ }
+ regsize = opc_0 == 1 ? 32 : 64;
+ signed_ = true;
+ }
+
+ const size_t datasize = 8 << scale;
+
+ // Operation
+
+ const IR::U64 offset = v.ExtendReg(64, Rm, option, shift);
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+ address = v.ir.Add(address, offset);
+
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ const IR::UAny data = v.X(datasize, Rt);
+ v.Mem(address, datasize / 8, acctype, data);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const IR::UAny data = v.Mem(address, datasize / 8, acctype);
+ if (signed_) {
+ v.X(regsize, Rt, v.SignExtend(data, regsize));
+ } else {
+ v.X(regsize, Rt, v.ZeroExtend(data, regsize));
+ }
+ break;
+ }
+ case IR::MemOp::PREFETCH:
+ // TODO: Prefetch
+ break;
+ default:
+ UNREACHABLE();
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt) {
+ const Imm<1> opc_0{0};
+ const size_t scale = size.ZeroExtend<size_t>();
+ const u8 shift = S ? static_cast<u8>(scale) : 0;
+ if (!option.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+ return RegSharedDecodeAndOperation(*this, scale, shift, size, opc_1, opc_0, Rm, option, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt) {
+ const Imm<1> opc_0{1};
+ const size_t scale = size.ZeroExtend<size_t>();
+ const u8 shift = S ? static_cast<u8>(scale) : 0;
+ if (!option.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+ return RegSharedDecodeAndOperation(*this, scale, shift, size, opc_1, opc_0, Rm, option, Rn, Rt);
+}
+
+static bool VecSharedDecodeAndOperation(TranslatorVisitor& v, size_t scale, u8 shift, Imm<1> opc_0, Reg Rm, Imm<3> option, Reg Rn, Vec Vt) {
+ // Shared Decode
+
+ const auto acctype = IR::AccType::VEC;
+ const auto memop = opc_0 == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ const size_t datasize = 8 << scale;
+
+ // Operation
+
+ const IR::U64 offset = v.ExtendReg(64, Rm, option, shift);
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+ address = v.ir.Add(address, offset);
+
+ switch (memop) {
+ case IR::MemOp::STORE: {
+ const IR::UAnyU128 data = v.V_scalar(datasize, Vt);
+ v.Mem(address, datasize / 8, acctype, data);
+ break;
+ }
+ case IR::MemOp::LOAD: {
+ const IR::UAnyU128 data = v.Mem(address, datasize / 8, acctype);
+ v.V_scalar(datasize, Vt, data);
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::STR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt) {
+ const Imm<1> opc_0{0};
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+ const u8 shift = S ? static_cast<u8>(scale) : 0;
+ if (!option.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+ return VecSharedDecodeAndOperation(*this, scale, shift, opc_0, Rm, option, Rn, Vt);
+}
+
+bool TranslatorVisitor::LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt) {
+ const Imm<1> opc_0{1};
+ const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>();
+ if (scale > 4) {
+ return UnallocatedEncoding();
+ }
+ const u8 shift = S ? static_cast<u8>(scale) : 0;
+ if (!option.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+ return VecSharedDecodeAndOperation(*this, scale, shift, opc_0, Rm, option, Rn, Vt);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp
new file mode 100644
index 0000000000..d3f4194ff2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp
@@ -0,0 +1,149 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool StoreRegister(TranslatorVisitor& v, const size_t datasize, const Imm<9> imm9, const Reg Rn, const Reg Rt) {
+ const u64 offset = imm9.SignExtend<u64>();
+ const auto acctype = IR::AccType::UNPRIV;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check Stack Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+
+ const IR::UAny data = v.X(datasize, Rt);
+ v.Mem(address, datasize / 8, acctype, data);
+ return true;
+}
+
+static bool LoadRegister(TranslatorVisitor& v, const size_t datasize, const Imm<9> imm9, const Reg Rn, const Reg Rt) {
+ const u64 offset = imm9.SignExtend<u64>();
+ const auto acctype = IR::AccType::UNPRIV;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check Stack Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+
+ const IR::UAny data = v.Mem(address, datasize / 8, acctype);
+    // std::max selects the destination size: sub-32-bit loads zero-extend into a 32-bit register, 64-bit loads stay 64-bit.
+ const size_t extended_size = std::max<size_t>(32, datasize);
+ v.X(extended_size, Rt, v.ZeroExtend(data, extended_size));
+ return true;
+}
+
+static bool LoadRegisterSigned(TranslatorVisitor& v, const size_t datasize, const Imm<2> opc, const Imm<9> imm9, const Reg Rn, const Reg Rt) {
+ const u64 offset = imm9.SignExtend<u64>();
+ const auto acctype = IR::AccType::UNPRIV;
+
+ IR::MemOp memop;
+ bool is_signed;
+ size_t regsize;
+ if (opc.Bit<1>() == 0) {
+ // store or zero-extending load
+ memop = opc.Bit<0>() ? IR::MemOp::LOAD : IR::MemOp::STORE;
+ regsize = 32;
+ is_signed = false;
+ } else {
+ // sign-extending load
+ memop = IR::MemOp::LOAD;
+ regsize = opc.Bit<0>() ? 32 : 64;
+ is_signed = true;
+ }
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check Stack Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+ address = v.ir.Add(address, v.ir.Imm64(offset));
+
+ switch (memop) {
+ case IR::MemOp::STORE:
+ v.Mem(address, datasize / 8, acctype, v.X(datasize, Rt));
+ break;
+ case IR::MemOp::LOAD: {
+ const IR::UAny data = v.Mem(address, datasize / 8, acctype);
+ if (is_signed) {
+ v.X(regsize, Rt, v.SignExtend(data, regsize));
+ } else {
+ v.X(regsize, Rt, v.ZeroExtend(data, regsize));
+ }
+ break;
+ }
+ case IR::MemOp::PREFETCH:
+        // TODO: Prefetch(address, Rt)
+ break;
+ }
+ return true;
+}
+
+bool TranslatorVisitor::STTRB(Imm<9> imm9, Reg Rn, Reg Rt) {
+ return StoreRegister(*this, 8, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::STTRH(Imm<9> imm9, Reg Rn, Reg Rt) {
+ return StoreRegister(*this, 16, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::STTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt) {
+ const size_t scale = size.ZeroExtend<size_t>();
+ const size_t datasize = 8 << scale;
+ return StoreRegister(*this, datasize, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTRB(Imm<9> imm9, Reg Rn, Reg Rt) {
+ return LoadRegister(*this, 8, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTRH(Imm<9> imm9, Reg Rn, Reg Rt) {
+ return LoadRegister(*this, 16, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt) {
+ const size_t scale = size.ZeroExtend<size_t>();
+ const size_t datasize = 8 << scale;
+ return LoadRegister(*this, datasize, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTRSB(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) {
+ return LoadRegisterSigned(*this, 8, opc, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTRSH(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) {
+ return LoadRegisterSigned(*this, 16, opc, imm9, Rn, Rt);
+}
+
+bool TranslatorVisitor::LDTRSW(Imm<9> imm9, Reg Rn, Reg Rt) {
+ const u64 offset = imm9.SignExtend<u64>();
+ const auto acctype = IR::AccType::UNPRIV;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check Stack Alignment
+ address = SP(64);
+ } else {
+ address = X(64, Rn);
+ }
+ address = ir.Add(address, ir.Imm64(offset));
+
+ const IR::UAny data = Mem(address, 4, acctype);
+ X(64, Rt, SignExtend(data, 64));
+ return true;
+}
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp
new file mode 100644
index 0000000000..b4bc842942
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp
@@ -0,0 +1,224 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp memop, bool Q, bool S, bool R, bool replicate, std::optional<Reg> Rm, Imm<3> opcode, Imm<2> size, Reg Rn, Vec Vt) {
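+    // selem is the number of consecutive registers (structures) transferred: 1 for LD1/ST1 up to 4 for LD4/ST4.
+    // The switch below recovers the element index from Q:S:size and refines the element size (scale).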
+ const size_t selem = (opcode.Bit<0>() << 1 | u32{R}) + 1;
+ size_t scale = opcode.Bits<1, 2>();
+ size_t index = 0;
+
+ switch (scale) {
+ case 0:
+ index = Q << 3 | S << 2 | size.ZeroExtend();
+ break;
+ case 1:
+ if (size.Bit<0>()) {
+ return v.UnallocatedEncoding();
+ }
+ index = Q << 2 | S << 1 | u32{size.Bit<1>()};
+ break;
+ case 2:
+ if (size.Bit<1>()) {
+ return v.UnallocatedEncoding();
+ }
+ if (size.Bit<0>()) {
+ if (S) {
+ return v.UnallocatedEncoding();
+ }
+ index = Q;
+ scale = 3;
+ } else {
+ index = Q << 1 | u32{S};
+ }
+ break;
+ case 3:
+ if (memop == IR::MemOp::STORE || S) {
+ return v.UnallocatedEncoding();
+ }
+ scale = size.ZeroExtend();
+ break;
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << scale;
+ const size_t ebytes = esize / 8;
+
+ IR::U64 address;
+ if (Rn == Reg::SP) {
+ // TODO: Check SP Alignment
+ address = v.SP(64);
+ } else {
+ address = v.X(64, Rn);
+ }
+
+ IR::U64 offs = v.ir.Imm64(0);
+ if (replicate) {
+ for (size_t s = 0; s < selem; s++) {
+ const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
+ const IR::UAnyU128 element = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
+ const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, element);
+
+ v.V(datasize, tt, broadcasted_element);
+
+ offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
+ }
+ } else {
+ for (size_t s = 0; s < selem; s++) {
+ const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
+ const IR::U128 rval = v.V(128, tt);
+
+ if (memop == IR::MemOp::LOAD) {
+ const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
+ const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, elem);
+ v.V(128, tt, vec);
+ } else {
+ const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index);
+ v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem);
+ }
+ offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
+ }
+ }
+
+ if (wback) {
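+        // Rm == 0b11111 selects post-index by the transfer size; any other Rm post-indexes by Xm.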
+ if (*Rm != Reg::SP) {
+ offs = v.X(64, *Rm);
+ }
+
+ if (Rn == Reg::SP) {
+ v.SP(64, v.ir.Add(address, offs));
+ } else {
+ v.X(64, Rn, v.ir.Add(address, offs));
+ }
+ }
+
+ return true;
+}
+
+bool TranslatorVisitor::LD1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, false, false, {},
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, false, false, Rm,
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD1R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, false, true,
+ {}, Imm<3>{0b110}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD1R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, false, true,
+ Rm, Imm<3>{0b110}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, true, false, {},
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, true, false, Rm,
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD2R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, true, true,
+ {}, Imm<3>{0b110}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD2R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, true, true,
+ Rm, Imm<3>{0b110}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, false, false, {},
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, false, false, Rm,
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD3R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, false, true,
+ {}, Imm<3>{0b111}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD3R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, false, true,
+ Rm, Imm<3>{0b111}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, true, false, {},
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, true, false, Rm,
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD4R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, true, true,
+ {}, Imm<3>{0b111}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::LD4R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, true, true,
+ Rm, Imm<3>{0b111}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, false, false, {},
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, false, false, Rm,
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, true, false, {},
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, true, false, Rm,
+ Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, false, false, {},
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, false, false, Rm,
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, true, false, {},
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+bool TranslatorVisitor::ST4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) {
+ return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, true, false, Rm,
+ Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp
new file mode 100644
index 0000000000..7bd2e6e356
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp
@@ -0,0 +1,59 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::MOVN(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) {
+ if (!sf && hw.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
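+    // hw selects which 16-bit slice of the destination the immediate occupies.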
+ const size_t pos = hw.ZeroExtend<size_t>() << 4;
+
+ u64 value = imm16.ZeroExtend<u64>() << pos;
+ value = ~value;
+
+ const auto result = I(datasize, value);
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MOVZ(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) {
+ if (!sf && hw.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const size_t pos = hw.ZeroExtend<size_t>() << 4;
+
+ const u64 value = imm16.ZeroExtend<u64>() << pos;
+ const auto result = I(datasize, value);
+
+ X(datasize, Rd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MOVK(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) {
+ if (!sf && hw.Bit<1>()) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = sf ? 64 : 32;
+ const size_t pos = hw.ZeroExtend<size_t>() << 4;
+
+ const u64 mask = u64(0xFFFF) << pos;
+ const u64 value = imm16.ZeroExtend<u64>() << pos;
+
+ auto result = X(datasize, Rd);
+ result = ir.And(result, I(datasize, ~mask));
+ result = ir.Or(result, I(datasize, value));
+ X(datasize, Rd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp
new file mode 100644
index 0000000000..6147a54a02
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp
@@ -0,0 +1,220 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
+bool LongAdd(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, Signedness sign) {
+ if ((size == 0b10 && !Q) || size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const size_t elements = datasize / esize;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+
+ const auto get_element = [&](IR::U128 vec, size_t element) {
+ const auto vec_element = v.ir.VectorGetElement(esize, vec, element);
+
+ if (sign == Signedness::Signed) {
+ return v.ir.SignExtendToLong(vec_element);
+ }
+
+ return v.ir.ZeroExtendToLong(vec_element);
+ };
+
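+    // Sum every element into a 64-bit accumulator; the stored result is twice the source element width.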
+ IR::U64 sum = get_element(operand, 0);
+ for (size_t i = 1; i < elements; i++) {
+ sum = v.ir.Add(sum, get_element(operand, i));
+ }
+
+ if (size == 0b00) {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantHalf(sum)));
+ } else if (size == 0b01) {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantWord(sum)));
+ } else {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(sum));
+ }
+
+ return true;
+}
+
+enum class MinMaxOperation {
+ Max,
+ MaxNumeric,
+ Min,
+ MinNumeric,
+};
+
+bool FPMinMax(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, MinMaxOperation operation) {
+ if (!Q || sz) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 32;
+ const size_t datasize = 128;
+ const size_t elements = datasize / esize;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+
+ const auto op = [&](const IR::U32U64& lhs, const IR::U32U64& rhs) {
+ switch (operation) {
+ case MinMaxOperation::Max:
+ return v.ir.FPMax(lhs, rhs);
+ case MinMaxOperation::MaxNumeric:
+ return v.ir.FPMaxNumeric(lhs, rhs);
+ case MinMaxOperation::Min:
+ return v.ir.FPMin(lhs, rhs);
+ case MinMaxOperation::MinNumeric:
+ return v.ir.FPMinNumeric(lhs, rhs);
+ default:
+ UNREACHABLE();
+ }
+ };
+
+ const auto reduce = [&](size_t start, size_t end) {
+ IR::U32U64 result = v.ir.VectorGetElement(esize, operand, start);
+
+ for (size_t i = start + 1; i < end; i++) {
+ const IR::U32U64 element = v.ir.VectorGetElement(esize, operand, i);
+
+ result = op(result, element);
+ }
+
+ return result;
+ };
+
+ const IR::U32U64 hi = reduce(elements / 2, elements);
+ const IR::U32U64 lo = reduce(0, elements / 2);
+ const IR::U32U64 result = op(lo, hi);
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+enum class ScalarMinMaxOperation {
+ Max,
+ Min,
+};
+
+bool ScalarMinMax(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, ScalarMinMaxOperation operation, Signedness sign) {
+ if ((size == 0b10 && !Q) || size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const size_t elements = datasize / esize;
+
+ const auto get_element = [&](IR::U128 vec, size_t element) {
+ const auto vec_element = v.ir.VectorGetElement(esize, vec, element);
+
+ if (sign == Signedness::Signed) {
+ return v.ir.SignExtendToWord(vec_element);
+ }
+
+ return v.ir.ZeroExtendToWord(vec_element);
+ };
+
+ const auto op_func = [&](const auto& a, const auto& b) {
+ switch (operation) {
+ case ScalarMinMaxOperation::Max:
+ if (sign == Signedness::Signed) {
+ return v.ir.MaxSigned(a, b);
+ }
+ return v.ir.MaxUnsigned(a, b);
+
+ case ScalarMinMaxOperation::Min:
+ if (sign == Signedness::Signed) {
+ return v.ir.MinSigned(a, b);
+ }
+ return v.ir.MinUnsigned(a, b);
+
+ default:
+ UNREACHABLE();
+ }
+ };
+
+ const IR::U128 operand = v.V(datasize, Vn);
+
+ IR::U32 value = get_element(operand, 0);
+ for (size_t i = 1; i < elements; i++) {
+ value = op_func(value, get_element(operand, i));
+ }
+
+ if (size == 0b00) {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantByte(value)));
+ } else if (size == 0b01) {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantHalf(value)));
+ } else {
+ v.V(datasize, Vd, v.ir.ZeroExtendToQuad(value));
+ }
+
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if ((size == 0b10 && !Q) || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+
+ V(128, Vd, ir.VectorReduceAdd(esize, operand));
+ return true;
+}
+
+bool TranslatorVisitor::FMAXNMV_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::MaxNumeric);
+}
+
+bool TranslatorVisitor::FMAXV_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::Max);
+}
+
+bool TranslatorVisitor::FMINNMV_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::MinNumeric);
+}
+
+bool TranslatorVisitor::FMINV_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::Min);
+}
+
+bool TranslatorVisitor::SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return LongAdd(*this, Q, size, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Max, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Min, Signedness::Signed);
+}
+
+bool TranslatorVisitor::UADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return LongAdd(*this, Q, size, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Max, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Min, Signedness::Unsigned);
+}
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp
new file mode 100644
index 0000000000..d604703ef2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp
@@ -0,0 +1,46 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::AESD(Vec Vn, Vec Vd) {
+ const IR::U128 operand1 = ir.GetQ(Vd);
+ const IR::U128 operand2 = ir.GetQ(Vn);
+
+ const IR::U128 result = ir.AESDecryptSingleRound(ir.VectorEor(operand1, operand2));
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::AESE(Vec Vn, Vec Vd) {
+ const IR::U128 operand1 = ir.GetQ(Vd);
+ const IR::U128 operand2 = ir.GetQ(Vn);
+
+ const IR::U128 result = ir.AESEncryptSingleRound(ir.VectorEor(operand1, operand2));
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::AESIMC(Vec Vn, Vec Vd) {
+ const IR::U128 operand = ir.GetQ(Vn);
+ const IR::U128 result = ir.AESInverseMixColumns(operand);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::AESMC(Vec Vn, Vec Vd) {
+ const IR::U128 operand = ir.GetQ(Vn);
+ const IR::U128 result = ir.AESMixColumns(operand);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp
new file mode 100644
index 0000000000..b33bc8f5ad
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp
@@ -0,0 +1,158 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::DUP_elt_1(Imm<5> imm5, Vec Vn, Vec Vd) {
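+    // imm5's lowest set bit gives the element size; the bits above it give the element index.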
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t idxdsize = imm5.Bit<4>() ? 128 : 64;
+ const size_t esize = 8 << size;
+
+ const IR::U128 operand = V(idxdsize, Vn);
+ const IR::UAny element = ir.VectorGetElement(esize, operand, index);
+ const IR::U128 result = ir.ZeroExtendToQuad(element);
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::DUP_elt_2(bool Q, Imm<5> imm5, Vec Vn, Vec Vd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ if (size == 3 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t idxdsize = imm5.Bit<4>() ? 128 : 64;
+ const size_t esize = 8 << size;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(idxdsize, Vn);
+ const IR::U128 result = Q ? ir.VectorBroadcastElement(esize, operand, index) : ir.VectorBroadcastElementLower(esize, operand, index);
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::DUP_gen(bool Q, Imm<5> imm5, Reg Rn, Vec Vd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ if (size == 3 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::UAny element = X(esize, Rn);
+
+ const IR::U128 result = Q ? ir.VectorBroadcast(esize, element) : ir.VectorBroadcastLower(esize, element);
+
+ V(datasize, Vd, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::SMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size == 2 && !Q) {
+ return UnallocatedEncoding();
+ }
+
+ if (size > 2) {
+ return ReservedValue();
+ }
+
+ const size_t idxdsize = imm5.Bit<4>() ? 128 : 64;
+ const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t esize = 8 << size;
+ const size_t datasize = Q ? 64 : 32;
+
+ const IR::U128 operand = V(idxdsize, Vn);
+
+ const IR::UAny elem = ir.VectorGetElement(esize, operand, index);
+ X(datasize, Rd, SignExtend(elem, datasize));
+
+ return true;
+}
+
+bool TranslatorVisitor::UMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size < 3 && Q) {
+ return UnallocatedEncoding();
+ }
+
+ if (size == 3 && !Q) {
+ return UnallocatedEncoding();
+ }
+
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ const size_t idxdsize = imm5.Bit<4>() ? 128 : 64;
+ const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t esize = 8 << size;
+ const size_t datasize = Q ? 64 : 32;
+
+ const IR::U128 operand = V(idxdsize, Vn);
+
+ const IR::UAny elem = ir.VectorGetElement(esize, operand, index);
+ X(datasize, Rd, ZeroExtend(elem, datasize));
+
+ return true;
+}
+
+bool TranslatorVisitor::INS_gen(Imm<5> imm5, Reg Rn, Vec Vd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t esize = 8 << size;
+ const size_t datasize = 128;
+
+ const IR::UAny element = X(esize, Rn);
+ const IR::U128 result = ir.VectorSetElement(esize, V(datasize, Vd), index, element);
+ V(datasize, Vd, result);
+
+ return true;
+}
+
+bool TranslatorVisitor::INS_elt(Imm<5> imm5, Imm<4> imm4, Vec Vn, Vec Vd) {
+ const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend());
+ if (size > 3) {
+ return ReservedValue();
+ }
+
+ const size_t dst_index = imm5.ZeroExtend<size_t>() >> (size + 1);
+ const size_t src_index = imm4.ZeroExtend<size_t>() >> size;
+ const size_t idxdsize = imm4.Bit<3>() ? 128 : 64;
+ const size_t esize = 8 << size;
+
+ const IR::U128 operand = V(idxdsize, Vn);
+ const IR::UAny elem = ir.VectorGetElement(esize, operand, src_index);
+ const IR::U128 result = ir.VectorSetElement(esize, V(128, Vd), dst_index, elem);
+ V(128, Vd, result);
+
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp
new file mode 100644
index 0000000000..8464bba5d7
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp
@@ -0,0 +1,52 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::EOR3(Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const IR::U128 a = ir.GetQ(Va);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 result = ir.VectorEor(ir.VectorEor(n, m), a);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BCAX(Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const IR::U128 a = ir.GetQ(Va);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 result = ir.VectorEor(n, ir.VectorAndNot(m, a));
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SM3SS1(Vec Vm, Vec Va, Vec Vn, Vec Vd) {
+ const IR::U128 a = ir.GetQ(Va);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U32 top_a = ir.VectorGetElement(32, a, 3);
+ const IR::U32 top_m = ir.VectorGetElement(32, m, 3);
+ const IR::U32 top_n = ir.VectorGetElement(32, n, 3);
+
+ const IR::U32 rotated_n = ir.RotateRight(top_n, ir.Imm8(20));
+ const IR::U32 sum = ir.Add(ir.Add(rotated_n, top_m), top_a);
+ const IR::U32 result = ir.RotateRight(sum, ir.Imm8(25));
+
+ const IR::U128 zero_vector = ir.ZeroVector();
+ const IR::U128 vector_result = ir.VectorSetElement(32, zero_vector, 3, result);
+
+ ir.SetQ(Vd, vector_result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp
new file mode 100644
index 0000000000..f21570f90e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp
@@ -0,0 +1,102 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class SM3TTVariant {
+ A,
+ B,
+};
+
+bool SM3TT1(TranslatorVisitor& v, Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd, SM3TTVariant behavior) {
+ const IR::U128 d = v.ir.GetQ(Vd);
+ const IR::U128 m = v.ir.GetQ(Vm);
+ const IR::U128 n = v.ir.GetQ(Vn);
+ const u32 index = imm2.ZeroExtend();
+
+ const IR::U32 top_d = v.ir.VectorGetElement(32, d, 3);
+ const IR::U32 before_top_d = v.ir.VectorGetElement(32, d, 2);
+ const IR::U32 after_low_d = v.ir.VectorGetElement(32, d, 1);
+ const IR::U32 low_d = v.ir.VectorGetElement(32, d, 0);
+ const IR::U32 top_n = v.ir.VectorGetElement(32, n, 3);
+
+ const IR::U32 wj_prime = v.ir.VectorGetElement(32, m, index);
+ const IR::U32 ss2 = v.ir.Eor(top_n, v.ir.RotateRight(top_d, v.ir.Imm8(20)));
+ const IR::U32 tt1 = [&] {
+ if (behavior == SM3TTVariant::A) {
+ return v.ir.Eor(after_low_d, v.ir.Eor(top_d, before_top_d));
+ }
+ const IR::U32 tmp1 = v.ir.And(top_d, after_low_d);
+ const IR::U32 tmp2 = v.ir.And(top_d, before_top_d);
+ const IR::U32 tmp3 = v.ir.And(after_low_d, before_top_d);
+ return v.ir.Or(v.ir.Or(tmp1, tmp2), tmp3);
+ }();
+ const IR::U32 final_tt1 = v.ir.Add(tt1, v.ir.Add(low_d, v.ir.Add(ss2, wj_prime)));
+
+ const IR::U128 zero_vector = v.ir.ZeroVector();
+ const IR::U128 tmp1 = v.ir.VectorSetElement(32, zero_vector, 0, after_low_d);
+ const IR::U128 tmp2 = v.ir.VectorSetElement(32, tmp1, 1, v.ir.RotateRight(before_top_d, v.ir.Imm8(23)));
+ const IR::U128 tmp3 = v.ir.VectorSetElement(32, tmp2, 2, top_d);
+ const IR::U128 result = v.ir.VectorSetElement(32, tmp3, 3, final_tt1);
+
+ v.ir.SetQ(Vd, result);
+ return true;
+}
+
+bool SM3TT2(TranslatorVisitor& v, Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd, SM3TTVariant behavior) {
+ const IR::U128 d = v.ir.GetQ(Vd);
+ const IR::U128 m = v.ir.GetQ(Vm);
+ const IR::U128 n = v.ir.GetQ(Vn);
+ const u32 index = imm2.ZeroExtend();
+
+ const IR::U32 top_d = v.ir.VectorGetElement(32, d, 3);
+ const IR::U32 before_top_d = v.ir.VectorGetElement(32, d, 2);
+ const IR::U32 after_low_d = v.ir.VectorGetElement(32, d, 1);
+ const IR::U32 low_d = v.ir.VectorGetElement(32, d, 0);
+ const IR::U32 top_n = v.ir.VectorGetElement(32, n, 3);
+
+ const IR::U32 wj = v.ir.VectorGetElement(32, m, index);
+ const IR::U32 tt2 = [&] {
+ if (behavior == SM3TTVariant::A) {
+ return v.ir.Eor(after_low_d, v.ir.Eor(top_d, before_top_d));
+ }
+ const IR::U32 tmp1 = v.ir.And(top_d, before_top_d);
+ const IR::U32 tmp2 = v.ir.AndNot(after_low_d, top_d);
+ return v.ir.Or(tmp1, tmp2);
+ }();
+ const IR::U32 final_tt2 = v.ir.Add(tt2, v.ir.Add(low_d, v.ir.Add(top_n, wj)));
+ const IR::U32 top_result = v.ir.Eor(final_tt2, v.ir.Eor(v.ir.RotateRight(final_tt2, v.ir.Imm8(23)),
+ v.ir.RotateRight(final_tt2, v.ir.Imm8(15))));
+
+ const IR::U128 zero_vector = v.ir.ZeroVector();
+ const IR::U128 tmp1 = v.ir.VectorSetElement(32, zero_vector, 0, after_low_d);
+ const IR::U128 tmp2 = v.ir.VectorSetElement(32, tmp1, 1, v.ir.RotateRight(before_top_d, v.ir.Imm8(13)));
+ const IR::U128 tmp3 = v.ir.VectorSetElement(32, tmp2, 2, top_d);
+ const IR::U128 result = v.ir.VectorSetElement(32, tmp3, 3, top_result);
+
+ v.ir.SetQ(Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::SM3TT1A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) {
+ return SM3TT1(*this, Vm, imm2, Vn, Vd, SM3TTVariant::A);
+}
+
+bool TranslatorVisitor::SM3TT1B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) {
+ return SM3TT1(*this, Vm, imm2, Vn, Vd, SM3TTVariant::B);
+}
+
+bool TranslatorVisitor::SM3TT2A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) {
+ return SM3TT2(*this, Vm, imm2, Vn, Vd, SM3TTVariant::A);
+}
+
+bool TranslatorVisitor::SM3TT2B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) {
+ return SM3TT2(*this, Vm, imm2, Vn, Vd, SM3TTVariant::B);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp
new file mode 100644
index 0000000000..bb5dcbe93e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp
@@ -0,0 +1,27 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::EXT(bool Q, Vec Vm, Imm<4> imm4, Vec Vn, Vec Vd) {
+ if (!Q && imm4.Bit<3>()) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t position = imm4.ZeroExtend<size_t>() << 3;
+
+ const IR::U128 lo = V(datasize, Vn);
+ const IR::U128 hi = V(datasize, Vm);
+ const IR::U128 result = datasize == 64 ? ir.VectorExtractLower(lo, hi, position) : ir.VectorExtract(lo, hi, position);
+
+ V(datasize, Vd, result);
+
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp
new file mode 100644
index 0000000000..db8e83631a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp
@@ -0,0 +1,130 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::MOVI(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<4> cmode, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ // MOVI
+ // also FMOV (vector, immediate) when cmode == 0b1111
+ const auto movi = [&] {
+ const u64 imm64 = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h));
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ V(128, Vd, imm);
+ return true;
+ };
+
+ // MVNI
+ const auto mvni = [&] {
+ const u64 imm64 = ~AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h));
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ V(128, Vd, imm);
+ return true;
+ };
+
+ // ORR (vector, immediate)
+ const auto orr = [&] {
+ const u64 imm64 = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h));
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ const IR::U128 operand = V(datasize, Vd);
+ const IR::U128 result = ir.VectorOr(operand, imm);
+ V(datasize, Vd, result);
+ return true;
+ };
+
+ // BIC (vector, immediate)
+ const auto bic = [&] {
+ const u64 imm64 = ~AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h));
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ const IR::U128 operand = V(datasize, Vd);
+ const IR::U128 result = ir.VectorAnd(operand, imm);
+ V(datasize, Vd, result);
+ return true;
+ };
+
+ switch (concatenate(cmode, Imm<1>{op}).ZeroExtend()) {
+ case 0b00000:
+ case 0b00100:
+ case 0b01000:
+ case 0b01100:
+ case 0b10000:
+ case 0b10100:
+ case 0b11000:
+ case 0b11010:
+ case 0b11100:
+ case 0b11101:
+ case 0b11110:
+ return movi();
+ case 0b11111:
+ if (!Q) {
+ return UnallocatedEncoding();
+ }
+ return movi();
+ case 0b00001:
+ case 0b00101:
+ case 0b01001:
+ case 0b01101:
+ case 0b10001:
+ case 0b10101:
+ case 0b11001:
+ case 0b11011:
+ return mvni();
+ case 0b00010:
+ case 0b00110:
+ case 0b01010:
+ case 0b01110:
+ case 0b10010:
+ case 0b10110:
+ return orr();
+ case 0b00011:
+ case 0b00111:
+ case 0b01011:
+ case 0b01111:
+ case 0b10011:
+ case 0b10111:
+ return bic();
+ }
+
+ UNREACHABLE();
+}
+
+bool TranslatorVisitor::FMOV_2(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ if (op && !Q) {
+ return UnallocatedEncoding();
+ }
+
+ const u64 imm64 = AdvSIMDExpandImm(op, Imm<4>{0b1111}, concatenate(a, b, c, d, e, f, g, h));
+
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ V(128, Vd, imm);
+ return true;
+}
+
+bool TranslatorVisitor::FMOV_3(bool Q, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const Imm<8> imm8 = concatenate(a, b, c, d, e, f, g, h);
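+    // Expand imm8 = abcdefgh into an FP16 immediate: sign = a, exponent = NOT(b):b:b, fraction = cdefgh:000000,
+    // then replicate that half-precision constant across the vector.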
+ const u16 imm16 = [&imm8] {
+ u16 imm16 = 0;
+ imm16 |= imm8.Bit<7>() ? 0x8000 : 0;
+ imm16 |= imm8.Bit<6>() ? 0x3000 : 0x4000;
+ imm16 |= imm8.Bits<0, 5, u16>() << 6;
+ return imm16;
+ }();
+ const u64 imm64 = mcl::bit::replicate_element<u16, u64>(imm16);
+
+ const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64));
+ V(128, Vd, imm);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp
new file mode 100644
index 0000000000..06e15273cf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp
@@ -0,0 +1,113 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <tuple>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class Transposition {
+ TRN1,
+ TRN2,
+};
+
+bool VectorTranspose(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Transposition type) {
+ if (!Q && size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const u8 esize = static_cast<u8>(8 << size.ZeroExtend());
+
+ const IR::U128 m = v.V(datasize, Vm);
+ const IR::U128 n = v.V(datasize, Vn);
+ const IR::U128 result = v.ir.VectorTranspose(esize, n, m, type == Transposition::TRN2);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class UnzipType {
+ Even,
+ Odd,
+};
+
+bool VectorUnzip(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, UnzipType type) {
+ if (size == 0b11 && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 n = v.V(datasize, Vn);
+ const IR::U128 m = v.V(datasize, Vm);
+ const IR::U128 result = type == UnzipType::Even
+ ? (Q ? v.ir.VectorDeinterleaveEven(esize, n, m) : v.ir.VectorDeinterleaveEvenLower(esize, n, m))
+ : (Q ? v.ir.VectorDeinterleaveOdd(esize, n, m) : v.ir.VectorDeinterleaveOddLower(esize, n, m));
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::TRN1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorTranspose(*this, Q, size, Vm, Vn, Vd, Transposition::TRN1);
+}
+
+bool TranslatorVisitor::TRN2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorTranspose(*this, Q, size, Vm, Vn, Vd, Transposition::TRN2);
+}
+
+bool TranslatorVisitor::UZP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorUnzip(*this, Q, size, Vm, Vn, Vd, UnzipType::Even);
+}
+
+bool TranslatorVisitor::UZP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorUnzip(*this, Q, size, Vm, Vn, Vd, UnzipType::Odd);
+}
+
+bool TranslatorVisitor::ZIP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorInterleaveLower(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ZIP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = [&] {
+ if (Q) {
+ return ir.VectorInterleaveUpper(esize, operand1, operand2);
+ }
+
+        // TODO: Inelegant: interleave the two 64-bit vectors in full, then rotate the upper half of the
+        // result (the interleave of the operands' upper elements) down into the low 64 bits.
+ const IR::U128 interleaved = ir.VectorInterleaveLower(esize, operand1, operand2);
+ return ir.VectorZeroUpper(ir.VectorRotateWholeVectorRight(interleaved, 64));
+ }();
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp
new file mode 100644
index 0000000000..63615b0f9a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp
@@ -0,0 +1,80 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class MinMaxOperation {
+ Max,
+ MaxNumeric,
+ Min,
+ MinNumeric,
+};
+
+bool FPPairwiseMinMax(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, MinMaxOperation operation) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = v.V(128, Vn);
+ const IR::U32U64 element1 = v.ir.VectorGetElement(esize, operand, 0);
+ const IR::U32U64 element2 = v.ir.VectorGetElement(esize, operand, 1);
+ const IR::U32U64 result = [&] {
+ switch (operation) {
+ case MinMaxOperation::Max:
+ return v.ir.FPMax(element1, element2);
+ case MinMaxOperation::MaxNumeric:
+ return v.ir.FPMaxNumeric(element1, element2);
+ case MinMaxOperation::Min:
+ return v.ir.FPMin(element1, element2);
+ case MinMaxOperation::MinNumeric:
+ return v.ir.FPMinNumeric(element1, element2);
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ v.V(128, Vd, v.ir.ZeroExtendToQuad(result));
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::ADDP_pair(Imm<2> size, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
+ const IR::U64 operand1 = ir.VectorGetElement(64, V(128, Vn), 0);
+ const IR::U64 operand2 = ir.VectorGetElement(64, V(128, Vn), 1);
+ const IR::U128 result = ir.ZeroExtendToQuad(ir.Add(operand1, operand2));
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FADDP_pair_2(bool size, Vec Vn, Vec Vd) {
+ const size_t esize = size ? 64 : 32;
+
+ const IR::U32U64 operand1 = ir.VectorGetElement(esize, V(128, Vn), 0);
+ const IR::U32U64 operand2 = ir.VectorGetElement(esize, V(128, Vn), 1);
+ const IR::U128 result = ir.ZeroExtendToQuad(ir.FPAdd(operand1, operand2));
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMAXNMP_pair_2(bool sz, Vec Vn, Vec Vd) {
+ return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::MaxNumeric);
+}
+
+bool TranslatorVisitor::FMAXP_pair_2(bool sz, Vec Vn, Vec Vd) {
+ return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::Max);
+}
+
+bool TranslatorVisitor::FMINNMP_pair_2(bool sz, Vec Vn, Vec Vd) {
+ return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::MinNumeric);
+}
+
+bool TranslatorVisitor::FMINP_pair_2(bool sz, Vec Vn, Vec Vd) {
+ return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::Min);
+}
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp
new file mode 100644
index 0000000000..a0570edc7e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp
@@ -0,0 +1,354 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class Narrowing {
+ Truncation,
+ SaturateToUnsigned,
+ SaturateToSigned,
+};
+
+enum class SaturatingShiftLeftType {
+ Signed,
+ Unsigned,
+ SignedWithUnsignedSaturation,
+};
+
+enum class ShiftExtraBehavior {
+ None,
+ Accumulate,
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned,
+};
+
+enum class FloatConversionDirection {
+ FixedToFloat,
+ FloatToFixed,
+};
+
+bool SaturatingShiftLeft(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, SaturatingShiftLeftType type) {
+ if (immh == 0b0000) {
+ return v.ReservedValue();
+ }
+
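+    // The highest set bit of immh encodes the element size; immh:immb minus esize gives the left-shift amount.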
+ const size_t esize = 8U << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t shift_amount = concatenate(immh, immb).ZeroExtend() - esize;
+
+ const IR::U128 operand = v.ir.ZeroExtendToQuad(v.V_scalar(esize, Vn));
+ const IR::U128 shift = v.ir.ZeroExtendToQuad(v.I(esize, shift_amount));
+ const IR::U128 result = [&v, esize, operand, shift, type, shift_amount] {
+ if (type == SaturatingShiftLeftType::Signed) {
+ return v.ir.VectorSignedSaturatedShiftLeft(esize, operand, shift);
+ }
+
+ if (type == SaturatingShiftLeftType::Unsigned) {
+ return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand, shift);
+ }
+
+ return v.ir.VectorSignedSaturatedShiftLeftUnsigned(esize, operand, static_cast<u8>(shift_amount));
+ }();
+
+ v.ir.SetQ(Vd, result);
+ return true;
+}
+
+bool ShiftRight(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftExtraBehavior behavior, Signedness signedness) {
+ if (!immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
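+ // Only the 64-bit scalar form is valid here (immh<3> set); the right-shift amount is encoded as 128 - immh:immb.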
+ const size_t esize = 64;
+ const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend());
+
+ const IR::U64 operand = v.V_scalar(esize, Vn);
+ IR::U64 result = [&]() -> IR::U64 {
+ if (signedness == Signedness::Signed) {
+ return v.ir.ArithmeticShiftRight(operand, v.ir.Imm8(shift_amount));
+ }
+ return v.ir.LogicalShiftRight(operand, v.ir.Imm8(shift_amount));
+ }();
+
+ if (behavior == ShiftExtraBehavior::Accumulate) {
+ const IR::U64 addend = v.V_scalar(esize, Vd);
+ result = v.ir.Add(result, addend);
+ }
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool RoundingShiftRight(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftExtraBehavior behavior, Signedness signedness) {
+ if (!immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 64;
+ const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend());
+
+ const IR::U64 operand = v.V_scalar(esize, Vn);
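+ // round_bit is bit (shift_amount - 1) of the operand, i.e. the last bit shifted out;
+ // adding it to the shifted value performs the rounding step of SRSHR/URSHR/SRSRA/URSRA.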
+ const IR::U64 round_bit = v.ir.LogicalShiftRight(v.ir.LogicalShiftLeft(operand, v.ir.Imm8(64 - shift_amount)), v.ir.Imm8(63));
+ const IR::U64 result = [&] {
+ const IR::U64 shifted = [&]() -> IR::U64 {
+ if (signedness == Signedness::Signed) {
+ return v.ir.ArithmeticShiftRight(operand, v.ir.Imm8(shift_amount));
+ }
+ return v.ir.LogicalShiftRight(operand, v.ir.Imm8(shift_amount));
+ }();
+
+ IR::U64 tmp = v.ir.Add(shifted, round_bit);
+
+ if (behavior == ShiftExtraBehavior::Accumulate) {
+ tmp = v.ir.Add(tmp, v.V_scalar(esize, Vd));
+ }
+
+ return tmp;
+ }();
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+enum class ShiftDirection {
+ Left,
+ Right,
+};
+
+bool ShiftAndInsert(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftDirection direction) {
+ if (!immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 64;
+
+ const u8 shift_amount = [&] {
+ if (direction == ShiftDirection::Right) {
+ return static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend());
+ }
+
+ return static_cast<u8>(concatenate(immh, immb).ZeroExtend() - esize);
+ }();
+
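+ // mask covers the destination bits that receive the shifted value; all other bits of Vd
+ // are preserved, which gives SLI/SRI their insert semantics.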
+ const u64 mask = [&] {
+ if (direction == ShiftDirection::Right) {
+ return shift_amount == esize ? 0 : mcl::bit::ones<u64>(esize) >> shift_amount;
+ }
+
+ return mcl::bit::ones<u64>(esize) << shift_amount;
+ }();
+
+ const IR::U64 operand1 = v.V_scalar(esize, Vn);
+ const IR::U64 operand2 = v.V_scalar(esize, Vd);
+
+ const IR::U64 shifted = [&] {
+ if (direction == ShiftDirection::Right) {
+ return v.ir.LogicalShiftRight(operand1, v.ir.Imm8(shift_amount));
+ }
+
+ return v.ir.LogicalShiftLeft(operand1, v.ir.Imm8(shift_amount));
+ }();
+
+ const IR::U64 result = v.ir.Or(v.ir.AndNot(operand2, v.ir.Imm64(mask)), shifted);
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool ShiftRightNarrowing(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Narrowing narrowing, Signedness signedness) {
+ if (immh == 0b0000) {
+ return v.ReservedValue();
+ }
+
+ if (immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t source_esize = 2 * esize;
+ const u8 shift_amount = static_cast<u8>(source_esize - concatenate(immh, immb).ZeroExtend());
+
+ const IR::U128 operand = v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(source_esize, v.V(128, Vn), 0));
+
+ IR::U128 wide_result = [&] {
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorArithmeticShiftRight(source_esize, operand, shift_amount);
+ }
+ return v.ir.VectorLogicalShiftRight(source_esize, operand, shift_amount);
+ }();
+
+ const IR::U128 result = [&] {
+ switch (narrowing) {
+ case Narrowing::Truncation:
+ return v.ir.VectorNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToUnsigned:
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result);
+ }
+ return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToSigned:
+ ASSERT(signedness == Signedness::Signed);
+ return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result);
+ }
+ UNREACHABLE();
+ }();
+
+ const IR::UAny segment = v.ir.VectorGetElement(esize, result, 0);
+ v.V_scalar(esize, Vd, segment);
+ return true;
+}
+
+bool ScalarFPConvertWithRound(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness sign, FloatConversionDirection direction, FP::RoundingMode rounding_mode) {
+ const u32 immh_value = immh.ZeroExtend();
+
+ if ((immh_value & 0b1110) == 0b0000) {
+ return v.ReservedValue();
+ }
+
+ // TODO: We currently don't handle FP16, so bail out here, as the ARM reference manual allows.
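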
+ if ((immh_value & 0b1110) == 0b0010) {
+ return v.ReservedValue();
+ }
+
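+ // immh<3> selects the 64-bit element size; the number of fractional bits is encoded as 2 * esize - immh:immb.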
+ const size_t esize = (immh_value & 0b1000) != 0 ? 64 : 32;
+ const size_t concat = concatenate(immh, immb).ZeroExtend();
+ const size_t fbits = (esize * 2) - concat;
+
+ const IR::U32U64 operand = v.V_scalar(esize, Vn);
+ const IR::U32U64 result = [&]() -> IR::U32U64 {
+ switch (direction) {
+ case FloatConversionDirection::FloatToFixed:
+ if (esize == 64) {
+ return sign == Signedness::Signed
+ ? v.ir.FPToFixedS64(operand, fbits, rounding_mode)
+ : v.ir.FPToFixedU64(operand, fbits, rounding_mode);
+ }
+
+ return sign == Signedness::Signed
+ ? v.ir.FPToFixedS32(operand, fbits, rounding_mode)
+ : v.ir.FPToFixedU32(operand, fbits, rounding_mode);
+
+ case FloatConversionDirection::FixedToFloat:
+ if (esize == 64) {
+ return sign == Signedness::Signed
+ ? v.ir.FPSignedFixedToDouble(operand, fbits, rounding_mode)
+ : v.ir.FPUnsignedFixedToDouble(operand, fbits, rounding_mode);
+ }
+
+ return sign == Signedness::Signed
+ ? v.ir.FPSignedFixedToSingle(operand, fbits, rounding_mode)
+ : v.ir.FPUnsignedFixedToSingle(operand, fbits, rounding_mode);
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::FCVTZS_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FCVTZU_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::SCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode());
+}
+
+bool TranslatorVisitor::UCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode());
+}
+
+bool TranslatorVisitor::SLI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftAndInsert(*this, immh, immb, Vn, Vd, ShiftDirection::Left);
+}
+
+bool TranslatorVisitor::SRI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftAndInsert(*this, immh, immb, Vn, Vd, ShiftDirection::Right);
+}
+
+bool TranslatorVisitor::SQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::Signed);
+}
+
+bool TranslatorVisitor::SQSHLU_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::SignedWithUnsignedSaturation);
+}
+
+bool TranslatorVisitor::SQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToSigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToUnsigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SHL_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ if (!immh.Bit<3>()) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 64;
+ const u8 shift_amount = static_cast<u8>(concatenate(immh, immb).ZeroExtend() - esize);
+
+ const IR::U64 operand = V_scalar(esize, Vn);
+ const IR::U64 result = ir.LogicalShiftLeft(operand, ir.Imm8(shift_amount));
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::Unsigned);
+}
+
+bool TranslatorVisitor::UQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToUnsigned, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::URSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::URSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Unsigned);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp
new file mode 100644
index 0000000000..fb9ae9d141
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp
@@ -0,0 +1,430 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class ComparisonType {
+ EQ,
+ GE,
+ GT,
+ HI,
+ HS,
+ LE,
+ LT,
+};
+
+enum class ComparisonVariant {
+ Register,
+ Zero,
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned,
+};
+
+bool RoundingShiftLeft(TranslatorVisitor& v, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) {
+ if (size != 0b11) {
+ return v.ReservedValue();
+ }
+
+ const IR::U128 operand1 = v.V(64, Vn);
+ const IR::U128 operand2 = v.V(64, Vm);
+ const IR::U128 result = [&] {
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorRoundingShiftLeftSigned(64, operand1, operand2);
+ }
+
+ return v.ir.VectorRoundingShiftLeftUnsigned(64, operand1, operand2);
+ }();
+
+ v.V(64, Vd, result);
+ return true;
+}
+
+bool ScalarCompare(TranslatorVisitor& v, Imm<2> size, std::optional<Vec> Vm, Vec Vn, Vec Vd, ComparisonType type, ComparisonVariant variant) {
+ if (size != 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 64;
+ const size_t datasize = 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = variant == ComparisonVariant::Register ? v.V(datasize, *Vm) : v.ir.ZeroVector();
+
+ const IR::U128 result = [&] {
+ switch (type) {
+ case ComparisonType::EQ:
+ return v.ir.VectorEqual(esize, operand1, operand2);
+ case ComparisonType::GE:
+ return v.ir.VectorGreaterEqualSigned(esize, operand1, operand2);
+ case ComparisonType::GT:
+ return v.ir.VectorGreaterSigned(esize, operand1, operand2);
+ case ComparisonType::HI:
+ return v.ir.VectorGreaterUnsigned(esize, operand1, operand2);
+ case ComparisonType::HS:
+ return v.ir.VectorGreaterEqualUnsigned(esize, operand1, operand2);
+ case ComparisonType::LE:
+ return v.ir.VectorLessEqualSigned(esize, operand1, operand2);
+ case ComparisonType::LT:
+ default:
+ return v.ir.VectorLessSigned(esize, operand1, operand2);
+ }
+ }();
+
+ v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0));
+ return true;
+}
+
+enum class FPComparisonType {
+ EQ,
+ GE,
+ AbsoluteGE,
+ GT,
+ AbsoluteGT
+};
+
+bool ScalarFPCompareRegister(TranslatorVisitor& v, bool sz, Vec Vm, Vec Vn, Vec Vd, FPComparisonType type) {
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = esize;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ switch (type) {
+ case FPComparisonType::EQ:
+ return v.ir.FPVectorEqual(esize, operand1, operand2);
+ case FPComparisonType::GE:
+ return v.ir.FPVectorGreaterEqual(esize, operand1, operand2);
+ case FPComparisonType::AbsoluteGE:
+ return v.ir.FPVectorGreaterEqual(esize,
+ v.ir.FPVectorAbs(esize, operand1),
+ v.ir.FPVectorAbs(esize, operand2));
+ case FPComparisonType::GT:
+ return v.ir.FPVectorGreater(esize, operand1, operand2);
+ case FPComparisonType::AbsoluteGT:
+ return v.ir.FPVectorGreater(esize,
+ v.ir.FPVectorAbs(esize, operand1),
+ v.ir.FPVectorAbs(esize, operand2));
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0));
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::SQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = V_scalar(esize, Vm);
+ const auto result = ir.SignedSaturatedAdd(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = V_scalar(esize, Vm);
+ const auto result = ir.SignedSaturatedDoublingMultiplyReturnHigh(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQRDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0));
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, operand2);
+
+ V_scalar(esize, Vd, ir.VectorGetElement(esize, result, 0));
+ return true;
+}
+
+bool TranslatorVisitor::SQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = V_scalar(esize, Vm);
+ const auto result = ir.SignedSaturatedSub(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = V_scalar(esize, Vm);
+ const auto result = ir.UnsignedSaturatedAdd(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = V_scalar(esize, Vm);
+ const auto result = ir.UnsignedSaturatedSub(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = esize;
+
+ const IR::U64 operand1 = V_scalar(datasize, Vn);
+ const IR::U64 operand2 = V_scalar(datasize, Vm);
+ const IR::U64 result = ir.Add(operand1, operand2);
+ V_scalar(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMEQ_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::EQ, ComparisonVariant::Register);
+}
+
+bool TranslatorVisitor::CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::EQ, ComparisonVariant::Zero);
+}
+
+bool TranslatorVisitor::CMGE_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::GE, ComparisonVariant::Register);
+}
+
+bool TranslatorVisitor::CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::GE, ComparisonVariant::Zero);
+}
+
+bool TranslatorVisitor::CMGT_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::GT, ComparisonVariant::Register);
+}
+
+bool TranslatorVisitor::CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::GT, ComparisonVariant::Zero);
+}
+
+bool TranslatorVisitor::CMLE_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::LE, ComparisonVariant::Zero);
+}
+
+bool TranslatorVisitor::CMLT_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::LT, ComparisonVariant::Zero);
+}
+
+bool TranslatorVisitor::CMHI_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::HI, ComparisonVariant::Register);
+}
+
+bool TranslatorVisitor::CMHS_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::HS, ComparisonVariant::Register);
+}
+
+bool TranslatorVisitor::CMTST_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
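+ // CMTST sets the result to all ones when (Vn & Vm) is non-zero, hence the negated equal-to-zero comparison.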
+ const IR::U128 operand1 = V(64, Vn);
+ const IR::U128 operand2 = V(64, Vm);
+ const IR::U128 anded = ir.VectorAnd(operand1, operand2);
+ const IR::U128 result = ir.VectorNot(ir.VectorEqual(64, anded, ir.ZeroVector()));
+
+ V(64, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FABD_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand1 = V_scalar(esize, Vn);
+ const IR::U32U64 operand2 = V_scalar(esize, Vm);
+ const IR::U32U64 result = ir.FPAbs(ir.FPSub(operand1, operand2));
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMULX_vec_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand1 = V_scalar(esize, Vn);
+ const IR::U32U64 operand2 = V_scalar(esize, Vm);
+ const IR::U32U64 result = ir.FPMulX(operand1, operand2);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPS_1(Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+
+ const IR::U16 operand1 = V_scalar(esize, Vn);
+ const IR::U16 operand2 = V_scalar(esize, Vm);
+ const IR::U16 result = ir.FPRecipStepFused(operand1, operand2);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPS_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand1 = V_scalar(esize, Vn);
+ const IR::U32U64 operand2 = V_scalar(esize, Vm);
+ const IR::U32U64 result = ir.FPRecipStepFused(operand1, operand2);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTS_1(Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+
+ const IR::U16 operand1 = V_scalar(esize, Vn);
+ const IR::U16 operand2 = V_scalar(esize, Vm);
+ const IR::U16 result = ir.FPRSqrtStepFused(operand1, operand2);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTS_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand1 = V_scalar(esize, Vn);
+ const IR::U32U64 operand2 = V_scalar(esize, Vm);
+ const IR::U32U64 result = ir.FPRSqrtStepFused(operand1, operand2);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FACGE_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::AbsoluteGE);
+}
+
+bool TranslatorVisitor::FACGT_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::AbsoluteGT);
+}
+
+bool TranslatorVisitor::FCMEQ_reg_1(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 lhs = V(128, Vn);
+ const IR::U128 rhs = V(128, Vm);
+ const IR::U128 result = ir.FPVectorEqual(16, lhs, rhs);
+
+ V_scalar(16, Vd, ir.VectorGetElement(16, result, 0));
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::EQ);
+}
+
+bool TranslatorVisitor::FCMGE_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::GE);
+}
+
+bool TranslatorVisitor::FCMGT_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::GT);
+}
+
+bool TranslatorVisitor::SQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8U << size.ZeroExtend();
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0));
+ const IR::U128 result = ir.VectorSignedSaturatedShiftLeft(esize, operand1, operand2);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingShiftLeft(*this, size, Vm, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
+ const IR::U128 operand1 = V(64, Vn);
+ const IR::U128 operand2 = V(64, Vm);
+ const IR::U128 result = ir.VectorArithmeticVShift(64, operand1, operand2);
+
+ V(64, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = esize;
+
+ const IR::U64 operand1 = V_scalar(datasize, Vn);
+ const IR::U64 operand2 = V_scalar(datasize, Vm);
+ const IR::U64 result = ir.Sub(operand1, operand2);
+ V_scalar(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 8U << size.ZeroExtend();
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0));
+ const IR::U128 result = ir.VectorUnsignedSaturatedShiftLeft(esize, operand1, operand2);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::URSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingShiftLeft(*this, size, Vm, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
+ const IR::U128 operand1 = V(64, Vn);
+ const IR::U128 operand2 = V(64, Vm);
+ const IR::U128 result = ir.VectorLogicalVShift(64, operand1, operand2);
+
+ V(64, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp
new file mode 100644
index 0000000000..2289f5cbd1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp
@@ -0,0 +1,331 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class ComparisonType {
+ EQ,
+ GE,
+ GT,
+ LE,
+ LT
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
+bool ScalarFPCompareAgainstZero(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, ComparisonType type) {
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = esize;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 zero = v.ir.ZeroVector();
+ const IR::U128 result = [&] {
+ switch (type) {
+ case ComparisonType::EQ:
+ return v.ir.FPVectorEqual(esize, operand, zero);
+ case ComparisonType::GE:
+ return v.ir.FPVectorGreaterEqual(esize, operand, zero);
+ case ComparisonType::GT:
+ return v.ir.FPVectorGreater(esize, operand, zero);
+ case ComparisonType::LE:
+ return v.ir.FPVectorGreaterEqual(esize, zero, operand);
+ case ComparisonType::LT:
+ return v.ir.FPVectorGreater(esize, zero, operand);
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0));
+ return true;
+}
+
+bool ScalarFPConvertWithRound(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, FP::RoundingMode rmode, Signedness sign) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand = v.V_scalar(esize, Vn);
+ const IR::U32U64 result = [&]() -> IR::U32U64 {
+ if (sz) {
+ return sign == Signedness::Signed
+ ? v.ir.FPToFixedS64(operand, 0, rmode)
+ : v.ir.FPToFixedU64(operand, 0, rmode);
+ }
+
+ return sign == Signedness::Signed
+ ? v.ir.FPToFixedS32(operand, 0, rmode)
+ : v.ir.FPToFixedU32(operand, 0, rmode);
+ }();
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+using NarrowingFn = IR::U128 (IR::IREmitter::*)(size_t, const IR::U128&);
+
+bool SaturatedNarrow(TranslatorVisitor& v, Imm<2> size, Vec Vn, Vec Vd, NarrowingFn fn) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand = v.ir.ZeroExtendToQuad(v.V_scalar(2 * esize, Vn));
+ const IR::U128 result = (v.ir.*fn)(2 * esize, operand);
+
+ v.V_scalar(64, Vd, v.ir.VectorGetElement(64, result, 0));
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::ABS_1(Imm<2> size, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
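+ // Branchless absolute value: the arithmetic shift yields a sign mask, and (x ^ mask) - mask gives |x|.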
+ const IR::U64 operand1 = V_scalar(64, Vn);
+ const IR::U64 operand2 = ir.ArithmeticShiftRight(operand1, ir.Imm8(63));
+ const IR::U64 result = ir.Sub(ir.Eor(operand1, operand2), operand2);
+
+ V_scalar(64, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_zero_1(Vec Vn, Vec Vd) {
+ const IR::U128 operand = ir.ZeroExtendToQuad(V_scalar(16, Vn));
+ const IR::U128 zero = ir.ZeroVector();
+ const IR::U128 result = ir.FPVectorEqual(16, operand, zero);
+
+ V_scalar(16, Vd, ir.VectorGetElement(16, result, 0));
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_zero_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::EQ);
+}
+
+bool TranslatorVisitor::FCMGE_zero_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::GE);
+}
+
+bool TranslatorVisitor::FCMGT_zero_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::GT);
+}
+
+bool TranslatorVisitor::FCMLE_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::LE);
+}
+
+bool TranslatorVisitor::FCMLT_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::LT);
+}
+
+bool TranslatorVisitor::FCVTAS_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, Signedness::Signed);
+}
+
+bool TranslatorVisitor::FCVTAU_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FCVTMS_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, Signedness::Signed);
+}
+
+bool TranslatorVisitor::FCVTMU_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FCVTNS_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, Signedness::Signed);
+}
+
+bool TranslatorVisitor::FCVTNU_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FCVTPS_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, Signedness::Signed);
+}
+
+bool TranslatorVisitor::FCVTPU_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FCVTXN_1(bool sz, Vec Vn, Vec Vd) {
+ if (!sz) {
+ return ReservedValue();
+ }
+
+ const IR::U64 element = V_scalar(64, Vn);
+ const IR::U32 result = ir.FPDoubleToSingle(element, FP::RoundingMode::ToOdd);
+
+ V_scalar(32, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTZS_int_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsZero, Signedness::Signed);
+}
+
+bool TranslatorVisitor::FCVTZU_int_2(bool sz, Vec Vn, Vec Vd) {
+ return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsZero, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FRECPE_1(Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+
+ const IR::U16 operand = V_scalar(esize, Vn);
+ const IR::U16 result = ir.FPRecipEstimate(operand);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPE_2(bool sz, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand = V_scalar(esize, Vn);
+ const IR::U32U64 result = ir.FPRecipEstimate(operand);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPX_1(Vec Vn, Vec Vd) {
+ const IR::U16 operand = V_scalar(16, Vn);
+ const IR::U16 result = ir.FPRecipExponent(operand);
+
+ V_scalar(16, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPX_2(bool sz, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand = V_scalar(esize, Vn);
+ const IR::U32U64 result = ir.FPRecipExponent(operand);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTE_1(Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+
+ const IR::U16 operand = V_scalar(esize, Vn);
+ const IR::U16 result = ir.FPRSqrtEstimate(operand);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTE_2(bool sz, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 operand = V_scalar(esize, Vn);
+ const IR::U32U64 result = ir.FPRSqrtEstimate(operand);
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::NEG_1(Imm<2> size, Vec Vn, Vec Vd) {
+ if (size != 0b11) {
+ return ReservedValue();
+ }
+
+ const IR::U64 operand = V_scalar(64, Vn);
+ const IR::U64 result = ir.Sub(ir.Imm64(0), operand);
+
+ V_scalar(64, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SCVTF_int_2(bool sz, Vec Vn, Vec Vd) {
+ const auto esize = sz ? 64 : 32;
+
+ const IR::U32U64 element = V_scalar(esize, Vn);
+ const IR::U32U64 result = esize == 32
+ ? IR::U32U64(ir.FPSignedFixedToSingle(element, 0, ir.current_location->FPCR().RMode()))
+ : IR::U32U64(ir.FPSignedFixedToDouble(element, 0, ir.current_location->FPCR().RMode()));
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQABS_1(Imm<2> size, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 result = ir.VectorSignedSaturatedAbs(esize, operand);
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQNEG_1(Imm<2> size, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 result = ir.VectorSignedSaturatedNeg(esize, operand);
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQXTN_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorSignedSaturatedNarrowToSigned);
+}
+
+bool TranslatorVisitor::SQXTUN_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorSignedSaturatedNarrowToUnsigned);
+}
+
+bool TranslatorVisitor::SUQADD_1(Imm<2> size, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vn), 0));
+ const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vd), 0));
+ const IR::U128 result = ir.VectorSignedSaturatedAccumulateUnsigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UCVTF_int_2(bool sz, Vec Vn, Vec Vd) {
+ const auto esize = sz ? 64 : 32;
+
+ const IR::U32U64 element = V_scalar(esize, Vn);
+ const IR::U32U64 result = esize == 32
+ ? IR::U32U64(ir.FPUnsignedFixedToSingle(element, 0, ir.current_location->FPCR().RMode()))
+ : IR::U32U64(ir.FPUnsignedFixedToDouble(element, 0, ir.current_location->FPCR().RMode()));
+
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQXTN_1(Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorUnsignedSaturatedNarrow);
+}
+
+bool TranslatorVisitor::USQADD_1(Imm<2> size, Vec Vn, Vec Vd) {
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vn), 0));
+ const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vd), 0));
+ const IR::U128 result = ir.VectorUnsignedSaturatedAccumulateSigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp
new file mode 100644
index 0000000000..dbbc4ce12c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp
@@ -0,0 +1,168 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <utility>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
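+// Decodes the element index and Vm register number from size/H/L/M/Vmlo:
+// for 16-bit elements the index is H:L:M and Vm is Vmlo; otherwise the index is H:L and Vm is M:Vmlo.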
+std::pair<size_t, Vec> Combine(Imm<2> size, Imm<1> H, Imm<1> L, Imm<1> M, Imm<4> Vmlo) {
+ if (size == 0b01) {
+ return {concatenate(H, L, M).ZeroExtend(), Vmlo.ZeroExtend<Vec>()};
+ }
+
+ return {concatenate(H, L).ZeroExtend(), concatenate(M, Vmlo).ZeroExtend<Vec>()};
+}
+
+enum class ExtraBehavior {
+ None,
+ Accumulate,
+ Subtract,
+ MultiplyExtended,
+};
+
+bool MultiplyByElement(TranslatorVisitor& v, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
+ if (sz && L == 1) {
+ return v.ReservedValue();
+ }
+
+ const size_t idxdsize = H == 1 ? 128 : 64;
+ const size_t index = sz ? H.ZeroExtend() : concatenate(H, L).ZeroExtend();
+ const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U32U64 element = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index);
+ const IR::U32U64 result = [&]() -> IR::U32U64 {
+ IR::U32U64 operand1 = v.V_scalar(esize, Vn);
+
+ if (extra_behavior == ExtraBehavior::None) {
+ return v.ir.FPMul(operand1, element);
+ }
+
+ if (extra_behavior == ExtraBehavior::MultiplyExtended) {
+ return v.ir.FPMulX(operand1, element);
+ }
+
+ if (extra_behavior == ExtraBehavior::Subtract) {
+ operand1 = v.ir.FPNeg(operand1);
+ }
+
+ const IR::U32U64 operand2 = v.V_scalar(esize, Vd);
+ return v.ir.FPMulAdd(operand2, operand1, element);
+ }();
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool MultiplyByElementHalfPrecision(TranslatorVisitor& v, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
+ const size_t esize = 16;
+ const size_t idxsize = H == 1 ? 128 : 64;
+ const size_t index = concatenate(H, L, M).ZeroExtend();
+
+ const auto Vm = Vmlo.ZeroExtend<Vec>();
+ const IR::U16 element = v.ir.VectorGetElement(esize, v.V(idxsize, Vm), index);
+ const IR::U16 result = [&]() -> IR::U16 {
+ IR::U16 operand1 = v.V_scalar(esize, Vn);
+
+ // TODO: Currently we don't implement half-precision paths
+ // for regular multiplication and extended multiplication.
+
+ if (extra_behavior == ExtraBehavior::None) {
+ ASSERT_FALSE("half-precision option unimplemented");
+ }
+
+ if (extra_behavior == ExtraBehavior::MultiplyExtended) {
+ ASSERT_FALSE("half-precision option unimplemented");
+ }
+
+ if (extra_behavior == ExtraBehavior::Subtract) {
+ operand1 = v.ir.FPNeg(operand1);
+ }
+
+ const IR::U16 operand2 = v.V_scalar(esize, Vd);
+ return v.ir.FPMulAdd(operand2, operand1, element);
+ }();
+
+ v.V_scalar(esize, Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::FMLA_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::FMLS_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
+}
+
+bool TranslatorVisitor::FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
+}
+
+bool TranslatorVisitor::FMUL_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
+}
+
+bool TranslatorVisitor::FMULX_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::MultiplyExtended);
+}
+
+bool TranslatorVisitor::SQDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::UAny operand1 = V_scalar(esize, Vn);
+ const IR::UAny operand2 = ir.VectorGetElement(esize, V(128, Vm), index);
+ const auto result = ir.SignedSaturatedDoublingMultiplyReturnHigh(operand1, operand2);
+ V_scalar(esize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQRDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 operand2 = V(128, Vm);
+ const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, broadcast);
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQDMULL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
+ const IR::U128 operand2 = V(128, Vm);
+ const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, broadcast);
+
+ V(128, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
new file mode 100644
index 0000000000..795bfcdb8c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
@@ -0,0 +1,149 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+IR::U32 SHAchoose(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+ return ir.Eor(ir.And(ir.Eor(y, z), x), z);
+}
+
+IR::U32 SHAmajority(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+ return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z));
+}
+
+IR::U32 SHAparity(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+ return ir.Eor(ir.Eor(y, z), x);
+}
+
+using SHA1HashUpdateFunction = IR::U32(IREmitter&, IR::U32, IR::U32, IR::U32);
+
+IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFunction fn) {
+ IR::U128 x = ir.GetQ(Vd);
+ IR::U32 y = ir.VectorGetElement(32, ir.GetQ(Vn), 0);
+ const IR::U128 w = ir.GetQ(Vm);
+
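+ // Perform the four SHA-1 rounds executed by a single SHA1C/SHA1P/SHA1M instruction,
+ // with fn selecting the choose/parity/majority combining function.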
+ for (size_t i = 0; i < 4; i++) {
+ const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
+ const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
+ const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
+ const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
+ const IR::U32 t = fn(ir, after_low_x, before_high_x, high_x);
+ const IR::U32 w_segment = ir.VectorGetElement(32, w, i);
+
+ y = ir.Add(ir.Add(ir.Add(y, ir.RotateRight(low_x, ir.Imm8(27))), t), w_segment);
+ x = ir.VectorSetElement(32, x, 1, ir.RotateRight(after_low_x, ir.Imm8(2)));
+
+ // Move each 32-bit element one position to the left,
+ // e.g. [3, 2, 1, 0] becomes [2, 1, 0, 3]
+ const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96);
+ x = ir.VectorSetElement(32, shuffled_x, 0, y);
+ y = high_x;
+ }
+
+ return x;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAchoose);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA1M(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAmajority);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA1P(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAparity);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA1SU0(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 d = ir.GetQ(Vd);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ IR::U128 result = [&] {
+ const IR::U64 d_high = ir.VectorGetElement(64, d, 1);
+ const IR::U64 n_low = ir.VectorGetElement(64, n, 0);
+ const IR::U128 zero = ir.ZeroVector();
+
+ const IR::U128 tmp1 = ir.VectorSetElement(64, zero, 0, d_high);
+ return ir.VectorSetElement(64, tmp1, 1, n_low);
+ }();
+
+ result = ir.VectorEor(ir.VectorEor(result, d), m);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA1SU1(Vec Vn, Vec Vd) {
+ const IR::U128 d = ir.GetQ(Vd);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ // Shuffle down the whole vector and zero out the top 32 bits
+ const IR::U128 shuffled_n = ir.VectorSetElement(32, ir.VectorRotateWholeVectorRight(n, 32), 3, ir.Imm32(0));
+ const IR::U128 t = ir.VectorEor(d, shuffled_n);
+ const IR::U128 rotated_t = ir.VectorRotateLeft(32, t, 1);
+
+ const IR::U32 low_rotated_t = ir.RotateRight(ir.VectorGetElement(32, rotated_t, 0), ir.Imm8(31));
+ const IR::U32 high_t = ir.VectorGetElement(32, rotated_t, 3);
+ const IR::U128 result = ir.VectorSetElement(32, rotated_t, 3, ir.Eor(low_rotated_t, high_t));
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA1H(Vec Vn, Vec Vd) {
+ const IR::U128 data = ir.GetS(Vn);
+
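+ // SHA1H rotates the 32-bit element left by 30, expressed here as (x << 30) | (x >> 2).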
+ const IR::U128 result = ir.VectorOr(ir.VectorLogicalShiftLeft(32, data, 30),
+ ir.VectorLogicalShiftRight(32, data, 2));
+
+ ir.SetS(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) {
+ const IR::U128 x = ir.GetQ(Vd);
+ const IR::U128 y = ir.GetQ(Vn);
+
+ const IR::U128 result = ir.SHA256MessageSchedule0(x, y);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 x = ir.GetQ(Vd);
+ const IR::U128 y = ir.GetQ(Vn);
+ const IR::U128 z = ir.GetQ(Vm);
+
+ const IR::U128 result = ir.SHA256MessageSchedule1(x, y, z);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), true);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), false);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp
new file mode 100644
index 0000000000..766109bfee
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp
@@ -0,0 +1,300 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
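+// MakeSig builds the SHA-512 lower-case sigma functions (two rotations and a logical shift);
+// MakeMNSig builds the upper-case Sigma functions (three rotations).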
+IR::U64 MakeSig(IREmitter& ir, IR::U64 data, u8 first_rot_amount, u8 second_rot_amount, u8 shift_amount) {
+ const IR::U64 tmp1 = ir.RotateRight(data, ir.Imm8(first_rot_amount));
+ const IR::U64 tmp2 = ir.RotateRight(data, ir.Imm8(second_rot_amount));
+ const IR::U64 tmp3 = ir.LogicalShiftRight(data, ir.Imm8(shift_amount));
+
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+IR::U64 MakeMNSig(IREmitter& ir, IR::U64 data, u8 first_rot_amount, u8 second_rot_amount, u8 third_rot_amount) {
+ const IR::U64 tmp1 = ir.RotateRight(data, ir.Imm8(first_rot_amount));
+ const IR::U64 tmp2 = ir.RotateRight(data, ir.Imm8(second_rot_amount));
+ const IR::U64 tmp3 = ir.RotateRight(data, ir.Imm8(third_rot_amount));
+
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+enum class SHA512HashPart {
+ Part1,
+ Part2,
+};
+
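+// Shared implementation of SHA512H (Part1) and SHA512H2 (Part2); it computes the two
+// updated 64-bit lanes, with Vtmp becoming the upper lane of the result.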
+IR::U128 SHA512Hash(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA512HashPart part) {
+ const IR::U128 x = ir.GetQ(Vn);
+ const IR::U128 y = ir.GetQ(Vm);
+ const IR::U128 w = ir.GetQ(Vd);
+
+ const IR::U64 lower_x = ir.VectorGetElement(64, x, 0);
+ const IR::U64 upper_x = ir.VectorGetElement(64, x, 1);
+
+ const IR::U64 lower_y = ir.VectorGetElement(64, y, 0);
+ const IR::U64 upper_y = ir.VectorGetElement(64, y, 1);
+
+ const auto make_sigma = [&](IR::U64 data) {
+ if (part == SHA512HashPart::Part1) {
+ return MakeMNSig(ir, data, 14, 18, 41);
+ }
+ return MakeMNSig(ir, data, 28, 34, 39);
+ };
+
+ const auto make_partial_half = [&](IR::U64 a, IR::U64 b, IR::U64 c) {
+ const IR::U64 tmp1 = ir.And(a, b);
+
+ if (part == SHA512HashPart::Part1) {
+ const IR::U64 tmp2 = ir.AndNot(c, a);
+ return ir.Eor(tmp1, tmp2);
+ }
+
+ const IR::U64 tmp2 = ir.And(a, c);
+ const IR::U64 tmp3 = ir.And(upper_y, lower_y);
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+ };
+
+ const IR::U64 Vtmp = [&] {
+ const IR::U64 partial = [&] {
+ if (part == SHA512HashPart::Part1) {
+ return make_partial_half(upper_y, lower_x, upper_x);
+ }
+ return make_partial_half(lower_x, upper_y, lower_y);
+ }();
+ const IR::U64 upper_w = ir.VectorGetElement(64, w, 1);
+ const IR::U64 sig = [&] {
+ if (part == SHA512HashPart::Part1) {
+ return make_sigma(upper_y);
+ }
+ return make_sigma(lower_y);
+ }();
+
+ return ir.Add(partial, ir.Add(sig, upper_w));
+ }();
+
+ const IR::U128 low_result = [&] {
+ const IR::U64 tmp = [&]() -> IR::U64 {
+ if (part == SHA512HashPart::Part1) {
+ return ir.Add(Vtmp, lower_y);
+ }
+ return Vtmp;
+ }();
+ const IR::U64 partial = [&] {
+ if (part == SHA512HashPart::Part1) {
+ return make_partial_half(tmp, upper_y, lower_x);
+ }
+ return make_partial_half(Vtmp, lower_y, upper_y);
+ }();
+ const IR::U64 sig = make_sigma(tmp);
+ const IR::U64 lower_w = ir.VectorGetElement(64, w, 0);
+
+ return ir.ZeroExtendToQuad(ir.Add(partial, ir.Add(sig, lower_w)));
+ }();
+
+ return ir.VectorSetElement(64, low_result, 1, Vtmp);
+}
+
+enum class SM4RotationType {
+ SM4E,
+ SM4EKEY
+};
+
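+// Applies the SM4 linear transform: L for SM4E (left rotations by 2, 10, 18 and 24)
+// or L' for SM4EKEY (left rotations by 13 and 23), written below as right rotations.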
+IR::U32 SM4Rotation(IREmitter& ir, IR::U32 intval, IR::U32 round_result_low_word, SM4RotationType type) {
+ if (type == SM4RotationType::SM4E) {
+ const IR::U32 tmp1 = ir.RotateRight(intval, ir.Imm8(30));
+ const IR::U32 tmp2 = ir.RotateRight(intval, ir.Imm8(22));
+ const IR::U32 tmp3 = ir.RotateRight(intval, ir.Imm8(14));
+ const IR::U32 tmp4 = ir.RotateRight(intval, ir.Imm8(8));
+ const IR::U32 tmp5 = ir.Eor(intval, ir.Eor(tmp1, ir.Eor(tmp2, ir.Eor(tmp3, tmp4))));
+
+ return ir.Eor(tmp5, round_result_low_word);
+ }
+
+ const IR::U32 tmp1 = ir.RotateRight(intval, ir.Imm8(19));
+ const IR::U32 tmp2 = ir.RotateRight(intval, ir.Imm8(9));
+ return ir.Eor(round_result_low_word, ir.Eor(intval, ir.Eor(tmp1, tmp2)));
+}
+
+IR::U128 SM4Hash(IREmitter& ir, Vec Vn, Vec Vd, SM4RotationType type) {
+ const IR::U128 n = ir.GetQ(Vn);
+ IR::U128 roundresult = ir.GetQ(Vd);
+
+ for (size_t i = 0; i < 4; i++) {
+ const IR::U32 round_key = ir.VectorGetElement(32, n, i);
+
+ const IR::U32 upper_round = ir.VectorGetElement(32, roundresult, 3);
+ const IR::U32 before_upper_round = ir.VectorGetElement(32, roundresult, 2);
+ const IR::U32 after_lower_round = ir.VectorGetElement(32, roundresult, 1);
+
+ IR::U128 intval_vec = ir.ZeroExtendToQuad(ir.Eor(upper_round, ir.Eor(before_upper_round, ir.Eor(after_lower_round, round_key))));
+
+ for (size_t j = 0; j < 4; j++) {
+ const IR::U8 byte_element = ir.VectorGetElement(8, intval_vec, j);
+ intval_vec = ir.VectorSetElement(8, intval_vec, j, ir.SM4AccessSubstitutionBox(byte_element));
+ }
+
+ const IR::U32 intval_low_word = ir.VectorGetElement(32, intval_vec, 0);
+ const IR::U32 round_result_low_word = ir.VectorGetElement(32, roundresult, 0);
+ const IR::U32 intval = SM4Rotation(ir, intval_low_word, round_result_low_word, type);
+ roundresult = ir.VectorRotateWholeVectorRight(roundresult, 32);
+ roundresult = ir.VectorSetElement(32, roundresult, 3, intval);
+ }
+
+ return roundresult;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::SHA512SU0(Vec Vn, Vec Vd) {
+ const IR::U128 x = ir.GetQ(Vn);
+ const IR::U128 w = ir.GetQ(Vd);
+
+ const IR::U64 lower_x = ir.VectorGetElement(64, x, 0);
+ const IR::U64 lower_w = ir.VectorGetElement(64, w, 0);
+ const IR::U64 upper_w = ir.VectorGetElement(64, w, 1);
+
+ const auto make_sig0 = [&](IR::U64 data) {
+ return MakeSig(ir, data, 1, 8, 7);
+ };
+
+ const IR::U128 low_result = ir.ZeroExtendToQuad(ir.Add(lower_w, make_sig0(upper_w)));
+ const IR::U64 high_result = ir.Add(upper_w, make_sig0(lower_x));
+ const IR::U128 result = ir.VectorSetElement(64, low_result, 1, high_result);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA512SU1(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 x = ir.GetQ(Vn);
+ const IR::U128 y = ir.GetQ(Vm);
+ const IR::U128 w = ir.GetQ(Vd);
+
+ const auto make_sig1 = [&](IR::U64 data) {
+ return MakeSig(ir, data, 19, 61, 6);
+ };
+
+ const IR::U128 sig_vector = [&] {
+ const IR::U64 lower_x = ir.VectorGetElement(64, x, 0);
+ const IR::U64 upper_x = ir.VectorGetElement(64, x, 1);
+
+ const IR::U128 low_result = ir.ZeroExtendToQuad(make_sig1(lower_x));
+ return ir.VectorSetElement(64, low_result, 1, make_sig1(upper_x));
+ }();
+
+ const IR::U128 result = ir.VectorAdd(64, w, ir.VectorAdd(64, y, sig_vector));
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA512H(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = SHA512Hash(ir, Vm, Vn, Vd, SHA512HashPart::Part1);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHA512H2(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 result = SHA512Hash(ir, Vm, Vn, Vd, SHA512HashPart::Part2);
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::RAX1(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 rotated_m = ir.VectorRotateLeft(64, m, 1);
+ const IR::U128 result = ir.VectorEor(n, rotated_m);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::XAR(Vec Vm, Imm<6> imm6, Vec Vn, Vec Vd) {
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 tmp = ir.VectorEor(m, n);
+ const IR::U128 result = ir.VectorRotateRight(64, tmp, imm6.ZeroExtend<u8>());
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SM3PARTW1(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 d = ir.GetQ(Vd);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 eor_d_n = ir.VectorEor(d, n);
+
+ const IR::U128 result_low_three_words = [&] {
+ // Move the top-most 3 words down one element (i.e. [3, 2, 1, 0] -> [0, 3, 2, 1])
+ const IR::U128 shuffled_m = ir.VectorRotateWholeVectorRight(m, 32);
+
+ // The uppermost word is treated as junk data here and isn't used explicitly yet,
+ // so the fact that we EOR into it doesn't matter; it is overwritten with well-defined data below.
+ return ir.VectorEor(eor_d_n, ir.VectorRotateLeft(32, shuffled_m, 15));
+ }();
+
+ IR::U128 result = result_low_three_words;
+ for (size_t i = 0; i < 4; i++) {
+ if (i == 3) {
+ const IR::U32 top_eor_d_n = ir.VectorGetElement(32, eor_d_n, 3);
+ const IR::U32 low_result_word = ir.VectorGetElement(32, result, 0);
+ const IR::U32 top_result_word = ir.Eor(top_eor_d_n, ir.RotateRight(low_result_word, ir.Imm8(17)));
+
+ // Now the uppermost word is well-defined data.
+ result = ir.VectorSetElement(32, result, 3, top_result_word);
+ }
+
+ const IR::U32 word = ir.VectorGetElement(32, result, i);
+ const IR::U32 modified = ir.Eor(word, ir.Eor(ir.RotateRight(word, ir.Imm8(17)),
+ ir.RotateRight(word, ir.Imm8(9))));
+
+ result = ir.VectorSetElement(32, result, i, modified);
+ }
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SM3PARTW2(Vec Vm, Vec Vn, Vec Vd) {
+ const IR::U128 d = ir.GetQ(Vd);
+ const IR::U128 m = ir.GetQ(Vm);
+ const IR::U128 n = ir.GetQ(Vn);
+
+ const IR::U128 temp = ir.VectorEor(n, ir.VectorRotateLeft(32, m, 7));
+ const IR::U128 temp_result = ir.VectorEor(d, temp);
+ const IR::U32 temp2 = [&] {
+ const IR::U32 rotate1 = ir.RotateRight(ir.VectorGetElement(32, temp, 0), ir.Imm8(17));
+ const IR::U32 rotate2 = ir.RotateRight(rotate1, ir.Imm8(17));
+ const IR::U32 rotate3 = ir.RotateRight(rotate1, ir.Imm8(9));
+
+ return ir.Eor(rotate1, ir.Eor(rotate2, rotate3));
+ }();
+
+ const IR::U32 high_temp_result = ir.VectorGetElement(32, temp_result, 3);
+ const IR::U32 replacement = ir.Eor(high_temp_result, temp2);
+ const IR::U128 result = ir.VectorSetElement(32, temp_result, 3, replacement);
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SM4E(Vec Vn, Vec Vd) {
+ ir.SetQ(Vd, SM4Hash(ir, Vn, Vd, SM4RotationType::SM4E));
+ return true;
+}
+
+bool TranslatorVisitor::SM4EKEY(Vec Vm, Vec Vn, Vec Vd) {
+ ir.SetQ(Vd, SM4Hash(ir, Vm, Vn, SM4RotationType::SM4EKEY));
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp
new file mode 100644
index 0000000000..41b0952249
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp
@@ -0,0 +1,402 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/bit/bit_count.hpp>
+
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class Rounding {
+ None,
+ Round
+};
+
+enum class Accumulating {
+ None,
+ Accumulate
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
+enum class Narrowing {
+ Truncation,
+ SaturateToUnsigned,
+ SaturateToSigned,
+};
+
+enum class SaturatingShiftLeftType {
+ Signed,
+ Unsigned,
+ SignedWithUnsignedSaturation,
+};
+
+enum class FloatConversionDirection {
+ FixedToFloat,
+ FloatToFixed,
+};
+
+IR::U128 PerformRoundingCorrection(TranslatorVisitor& v, size_t esize, u64 round_value, IR::U128 original, IR::U128 shifted) {
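+    // Rounding trick: round_const has only the last bit shifted out set. VectorEqual
+    // yields an all-ones (-1) lane wherever that bit is set in the original, so
+    // subtracting the correction adds one to exactly those lanes of the truncated result.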
+ const IR::U128 round_const = v.ir.VectorBroadcast(esize, v.I(esize, round_value));
+ const IR::U128 round_correction = v.ir.VectorEqual(esize, v.ir.VectorAnd(original, round_const), round_const);
+ return v.ir.VectorSub(esize, shifted, round_correction);
+}
+
+bool ShiftRight(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Rounding rounding, Accumulating accumulating, Signedness signedness) {
+ if (immh == 0b0000) {
+ return v.DecodeError();
+ }
+
+ if (immh.Bit<3>() && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+
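+    // The right-shift amount is encoded as (2 * esize) - UInt(immh:immb), covering 1..esize.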
+ const u8 shift_amount = static_cast<u8>(2 * esize) - concatenate(immh, immb).ZeroExtend<u8>();
+
+ const IR::U128 operand = v.V(datasize, Vn);
+
+ IR::U128 result = [&] {
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorArithmeticShiftRight(esize, operand, shift_amount);
+ }
+ return v.ir.VectorLogicalShiftRight(esize, operand, shift_amount);
+ }();
+
+ if (rounding == Rounding::Round) {
+ const u64 round_value = 1ULL << (shift_amount - 1);
+ result = PerformRoundingCorrection(v, esize, round_value, operand, result);
+ }
+
+ if (accumulating == Accumulating::Accumulate) {
+ const IR::U128 accumulator = v.V(datasize, Vd);
+ result = v.ir.VectorAdd(esize, result, accumulator);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool ShiftRightNarrowing(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Rounding rounding, Narrowing narrowing, Signedness signedness) {
+ if (immh == 0b0000) {
+ return v.DecodeError();
+ }
+
+ if (immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t source_esize = 2 * esize;
+ const size_t part = Q ? 1 : 0;
+
+ const u8 shift_amount = static_cast<u8>(source_esize - concatenate(immh, immb).ZeroExtend());
+
+ const IR::U128 operand = v.V(128, Vn);
+
+ IR::U128 wide_result = [&] {
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorArithmeticShiftRight(source_esize, operand, shift_amount);
+ }
+ return v.ir.VectorLogicalShiftRight(source_esize, operand, shift_amount);
+ }();
+
+ if (rounding == Rounding::Round) {
+ const u64 round_value = 1ULL << (shift_amount - 1);
+ wide_result = PerformRoundingCorrection(v, source_esize, round_value, operand, wide_result);
+ }
+
+ const IR::U128 result = [&] {
+ switch (narrowing) {
+ case Narrowing::Truncation:
+ return v.ir.VectorNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToUnsigned:
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result);
+ }
+ return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result);
+ case Narrowing::SaturateToSigned:
+ ASSERT(signedness == Signedness::Signed);
+ return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result);
+ }
+ UNREACHABLE();
+ }();
+
+ v.Vpart(64, Vd, part, result);
+ return true;
+}
+
+bool ShiftLeftLong(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness signedness) {
+ if (immh == 0b0000) {
+ return v.DecodeError();
+ }
+
+ if (immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = 64;
+ const size_t part = Q ? 1 : 0;
+
+ const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize);
+
+ const IR::U128 operand = v.Vpart(datasize, Vn, part);
+ const IR::U128 expanded_operand = [&] {
+ if (signedness == Signedness::Signed) {
+ return v.ir.VectorSignExtend(esize, operand);
+ }
+ return v.ir.VectorZeroExtend(esize, operand);
+ }();
+ const IR::U128 result = v.ir.VectorLogicalShiftLeft(2 * esize, expanded_operand, shift_amount);
+
+ v.V(2 * datasize, Vd, result);
+ return true;
+}
+
+bool SaturatingShiftLeft(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, SaturatingShiftLeftType type) {
+ if (!Q && immh.Bit<3>()) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+ const size_t shift = concatenate(immh, immb).ZeroExtend() - esize;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 shift_vec = v.ir.VectorBroadcast(esize, v.I(esize, shift));
+ const IR::U128 result = [&] {
+ if (type == SaturatingShiftLeftType::Signed) {
+ return v.ir.VectorSignedSaturatedShiftLeft(esize, operand, shift_vec);
+ }
+
+ if (type == SaturatingShiftLeftType::Unsigned) {
+ return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand, shift_vec);
+ }
+
+ return v.ir.VectorSignedSaturatedShiftLeftUnsigned(esize, operand, static_cast<u8>(shift));
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool ConvertFloat(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness signedness, FloatConversionDirection direction, FP::RoundingMode rounding_mode) {
+ if (immh == 0b0000) {
+ return v.DecodeError();
+ }
+
+ if (immh == 0b0001 || immh == 0b0010 || immh == 0b0011) {
+ return v.ReservedValue();
+ }
+
+ if (immh.Bit<3>() && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+
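+    // The number of fractional bits is encoded as (2 * esize) - UInt(immh:immb).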
+ const u8 fbits = static_cast<u8>(esize * 2) - concatenate(immh, immb).ZeroExtend<u8>();
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 result = [&] {
+ switch (direction) {
+ case FloatConversionDirection::FixedToFloat:
+ return signedness == Signedness::Signed
+ ? v.ir.FPVectorFromSignedFixed(esize, operand, fbits, rounding_mode)
+ : v.ir.FPVectorFromUnsignedFixed(esize, operand, fbits, rounding_mode);
+ case FloatConversionDirection::FloatToFixed:
+ return signedness == Signedness::Signed
+ ? v.ir.FPVectorToSignedFixed(esize, operand, fbits, rounding_mode)
+ : v.ir.FPVectorToUnsignedFixed(esize, operand, fbits, rounding_mode);
+ }
+ UNREACHABLE();
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+bool TranslatorVisitor::SSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SHL_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ if (immh == 0b0000) {
+ return DecodeError();
+ }
+ if (immh.Bit<3>() && !Q) {
+ return ReservedValue();
+ }
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+
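+    // The left-shift amount is encoded as UInt(immh:immb) - esize, covering 0..esize-1.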
+ const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize);
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorLogicalShiftLeft(esize, operand, shift_amount);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::Truncation, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::RSHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::Truncation, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::SQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::Signed);
+}
+
+bool TranslatorVisitor::SQSHLU_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::SignedWithUnsignedSaturation);
+}
+
+bool TranslatorVisitor::SQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToSigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToSigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQRSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Signed);
+}
+
+bool TranslatorVisitor::UQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::Unsigned);
+}
+
+bool TranslatorVisitor::UQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::SSHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftLeftLong(*this, Q, immh, immb, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::URSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::URSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ShiftLeftLong(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::SRI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ if (immh == 0b0000) {
+ return DecodeError();
+ }
+
+ if (!Q && immh.Bit<3>()) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+
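+    // SRI keeps the top shift_amount bits of each Vd element and fills the remaining
+    // low bits from the right-shifted Vn element.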
+ const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend<u8>());
+ const u64 mask = shift_amount == esize ? 0 : mcl::bit::ones<u64>(esize) >> shift_amount;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vd);
+
+ const IR::U128 shifted = ir.VectorLogicalShiftRight(esize, operand1, shift_amount);
+ const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
+ const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SLI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ if (immh == 0b0000) {
+ return DecodeError();
+ }
+
+ if (!Q && immh.Bit<3>()) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend());
+ const size_t datasize = Q ? 128 : 64;
+
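+    // SLI keeps the low shift_amount bits of each Vd element and fills the remaining
+    // high bits from the left-shifted Vn element.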
+ const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize);
+ const u64 mask = mcl::bit::ones<u64>(esize) << shift_amount;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vd);
+
+ const IR::U128 shifted = ir.VectorLogicalShiftLeft(esize, operand1, shift_amount);
+ const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
+ const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode());
+}
+
+bool TranslatorVisitor::UCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode());
+}
+
+bool TranslatorVisitor::FCVTZS_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FCVTZU_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) {
+ return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp
new file mode 100644
index 0000000000..f267c13581
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp
@@ -0,0 +1,40 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <vector>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+static bool TableLookup(TranslatorVisitor& v, bool Q, Vec Vm, Imm<2> len, bool is_tbl, size_t Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
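+    // Gather len + 1 consecutive table registers starting at Vn, wrapping modulo 32.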
+ const IR::Table table = v.ir.VectorTable([&] {
+ std::vector<IR::U128> result;
+ for (size_t i = 0; i < len.ZeroExtend<size_t>() + 1; ++i) {
+ result.emplace_back(v.ir.GetQ(static_cast<Vec>((Vn + i) % 32)));
+ }
+ return result;
+ }());
+
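+    // TBL substitutes zero for out-of-range indices; TBX leaves those destination lanes unchanged.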
+    const IR::U128 indices = v.ir.GetQ(Vm);
+    const IR::U128 defaults = is_tbl ? v.ir.ZeroVector() : v.ir.GetQ(Vd);
+
+    const IR::U128 result = v.ir.VectorTableLookup(defaults, table, indices);
+
+ v.V(datasize, Vd, datasize == 128 ? result : v.ir.VectorZeroUpper(result));
+ return true;
+}
+
+bool TranslatorVisitor::TBL(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd) {
+ return TableLookup(*this, Q, Vm, len, true, Vn, Vd);
+}
+
+bool TranslatorVisitor::TBX(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd) {
+ return TableLookup(*this, Q, Vm, len, false, Vn, Vd);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp
new file mode 100644
index 0000000000..8cc677b652
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp
@@ -0,0 +1,256 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class AbsoluteDifferenceBehavior {
+ None,
+ Accumulate
+};
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
+bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, AbsoluteDifferenceBehavior behavior, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+
+ const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vn, Q));
+ const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vm, Q));
+ IR::U128 result = sign == Signedness::Signed ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2)
+ : v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2);
+
+ if (behavior == AbsoluteDifferenceBehavior::Accumulate) {
+ const IR::U128 data = v.V(2 * datasize, Vd);
+ result = v.ir.VectorAdd(2 * esize, result, data);
+ }
+
+ v.V(2 * datasize, Vd, result);
+ return true;
+}
+
+enum class MultiplyLongBehavior {
+ None,
+ Accumulate,
+ Subtract
+};
+
+bool MultiplyLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MultiplyLongBehavior behavior, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t doubled_esize = 2 * esize;
+ const size_t datasize = 64;
+ const size_t doubled_datasize = datasize * 2;
+
+ IR::U128 result = [&] {
+ const auto reg_n = v.Vpart(datasize, Vn, Q);
+ const auto reg_m = v.Vpart(datasize, Vm, Q);
+
+ return sign == Signedness::Signed
+ ? v.ir.VectorMultiplySignedWiden(esize, reg_n, reg_m)
+ : v.ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m);
+ }();
+
+ if (behavior == MultiplyLongBehavior::Accumulate) {
+ const IR::U128 addend = v.V(doubled_datasize, Vd);
+ result = v.ir.VectorAdd(doubled_esize, addend, result);
+ } else if (behavior == MultiplyLongBehavior::Subtract) {
+ const IR::U128 minuend = v.V(doubled_datasize, Vd);
+ result = v.ir.VectorSub(doubled_esize, minuend, result);
+ }
+
+ v.V(doubled_datasize, Vd, result);
+ return true;
+}
+
+enum class LongOperationBehavior {
+ Addition,
+ Subtraction
+};
+
+bool LongOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, LongOperationBehavior behavior, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t part = Q ? 1 : 0;
+
+ const auto get_operand = [&](Vec vec) {
+ const IR::U128 tmp = v.Vpart(64, vec, part);
+
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorSignExtend(esize, tmp);
+ }
+
+ return v.ir.VectorZeroExtend(esize, tmp);
+ };
+
+ const IR::U128 operand1 = get_operand(Vn);
+ const IR::U128 operand2 = get_operand(Vm);
+ const IR::U128 result = [&] {
+ if (behavior == LongOperationBehavior::Addition) {
+ return v.ir.VectorAdd(esize * 2, operand1, operand2);
+ }
+
+ return v.ir.VectorSub(esize * 2, operand1, operand2);
+ }();
+
+ v.V(128, Vd, result);
+ return true;
+}
+
+enum class WideOperationBehavior {
+ Addition,
+ Subtraction
+};
+
+bool WideOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, WideOperationBehavior behavior, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t part = Q ? 1 : 0;
+
+ const IR::U128 operand1 = v.V(128, Vn);
+ const IR::U128 operand2 = [&] {
+ const IR::U128 tmp = v.Vpart(64, Vm, part);
+
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorSignExtend(esize, tmp);
+ }
+
+ return v.ir.VectorZeroExtend(esize, tmp);
+ }();
+ const IR::U128 result = [&] {
+ if (behavior == WideOperationBehavior::Addition) {
+ return v.ir.VectorAdd(esize * 2, operand1, operand2);
+ }
+
+ return v.ir.VectorSub(esize * 2, operand1, operand2);
+ }();
+
+ v.V(128, Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::PMULL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
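+    // Only byte (size == 0b00) and doubleword (size == 0b11) polynomial multiplications are defined.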
+ if (size == 0b01 || size == 0b10) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+
+ const IR::U128 operand1 = Vpart(datasize, Vn, Q);
+ const IR::U128 operand2 = Vpart(datasize, Vm, Q);
+ const IR::U128 result = ir.VectorPolynomialMultiplyLong(esize, operand1, operand2);
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Addition, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Addition, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Subtract, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Subtraction, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Subtraction, Signedness::Signed);
+}
+
+bool TranslatorVisitor::UADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Addition, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Addition, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Subtract, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::None, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Subtraction, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Subtraction, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::SQDMULL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t part = Q ? 1 : 0;
+
+ const IR::U128 operand1 = Vpart(64, Vn, part);
+ const IR::U128 operand2 = Vpart(64, Vm, part);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, operand2);
+
+ V(128, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp
new file mode 100644
index 0000000000..5c8bf13aeb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp
@@ -0,0 +1,1240 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class Operation {
+ Add,
+ Subtract,
+};
+
+enum class ExtraBehavior {
+ None,
+ Round
+};
+
+bool HighNarrowingOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Operation op, ExtraBehavior behavior) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+    const size_t part = Q ? 1 : 0;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t doubled_esize = 2 * esize;
+
+ const IR::U128 operand1 = v.ir.GetQ(Vn);
+ const IR::U128 operand2 = v.ir.GetQ(Vm);
+ IR::U128 wide = [&] {
+ if (op == Operation::Add) {
+ return v.ir.VectorAdd(doubled_esize, operand1, operand2);
+ }
+ return v.ir.VectorSub(doubled_esize, operand1, operand2);
+ }();
+
+ if (behavior == ExtraBehavior::Round) {
+ const u64 round_const = 1ULL << (esize - 1);
+ const IR::U128 round_operand = v.ir.VectorBroadcast(doubled_esize, v.I(doubled_esize, round_const));
+ wide = v.ir.VectorAdd(doubled_esize, wide, round_operand);
+ }
+
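+    // Keep the upper half of each widened element: shift right by esize, then narrow.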
+ const IR::U128 result = v.ir.VectorNarrow(doubled_esize,
+ v.ir.VectorLogicalShiftRight(doubled_esize, wide, static_cast<u8>(esize)));
+
+ v.Vpart(64, Vd, part, result);
+ return true;
+}
+
+enum class AbsDiffExtraBehavior {
+ None,
+ Accumulate
+};
+
+bool SignedAbsoluteDifference(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, AbsDiffExtraBehavior behavior) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ const IR::U128 tmp = v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2);
+
+ if (behavior == AbsDiffExtraBehavior::Accumulate) {
+ const IR::U128 d = v.V(datasize, Vd);
+ return v.ir.VectorAdd(esize, d, tmp);
+ }
+
+ return tmp;
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
+bool RoundingHalvingAdd(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vm);
+ const IR::U128 operand2 = v.V(datasize, Vn);
+ const IR::U128 result = sign == Signedness::Signed ? v.ir.VectorRoundingHalvingAddSigned(esize, operand1, operand2)
+ : v.ir.VectorRoundingHalvingAddUnsigned(esize, operand1, operand2);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool RoundingShiftLeft(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) {
+ if (size == 0b11 && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorRoundingShiftLeftSigned(esize, operand1, operand2);
+ }
+
+ return v.ir.VectorRoundingShiftLeftUnsigned(esize, operand1, operand2);
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class ComparisonType {
+ EQ,
+ GE,
+ AbsoluteGE,
+ GT,
+ AbsoluteGT
+};
+
+bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, ComparisonType type) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ switch (type) {
+ case ComparisonType::EQ:
+ return v.ir.FPVectorEqual(esize, operand1, operand2);
+ case ComparisonType::GE:
+ return v.ir.FPVectorGreaterEqual(esize, operand1, operand2);
+ case ComparisonType::AbsoluteGE:
+ return v.ir.FPVectorGreaterEqual(esize,
+ v.ir.FPVectorAbs(esize, operand1),
+ v.ir.FPVectorAbs(esize, operand2));
+ case ComparisonType::GT:
+ return v.ir.FPVectorGreater(esize, operand1, operand2);
+ case ComparisonType::AbsoluteGT:
+ return v.ir.FPVectorGreater(esize,
+ v.ir.FPVectorAbs(esize, operand1),
+ v.ir.FPVectorAbs(esize, operand2));
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class MinMaxOperation {
+ Min,
+ Max,
+};
+
+bool VectorMinMaxOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ switch (operation) {
+ case MinMaxOperation::Max:
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorMaxSigned(esize, operand1, operand2);
+ }
+ return v.ir.VectorMaxUnsigned(esize, operand1, operand2);
+
+ case MinMaxOperation::Min:
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorMinSigned(esize, operand1, operand2);
+ }
+ return v.ir.VectorMinUnsigned(esize, operand1, operand2);
+
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FPMinMaxOperation(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ if (operation == MinMaxOperation::Min) {
+ return v.ir.FPVectorMin(esize, operand1, operand2);
+ }
+
+ return v.ir.FPVectorMax(esize, operand1, operand2);
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FPMinMaxNumericOperation(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ if (operation == MinMaxOperation::Min) {
+ return v.ir.FPVectorMinNumeric(esize, operand1, operand2);
+ }
+
+ return v.ir.FPVectorMaxNumeric(esize, operand1, operand2);
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool PairedMinMaxOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation, Signedness sign) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ IR::U128 result = [&] {
+ switch (operation) {
+ case MinMaxOperation::Max:
+ if (sign == Signedness::Signed) {
+ return Q ? v.ir.VectorPairedMaxSigned(esize, operand1, operand2) : v.ir.VectorPairedMaxSignedLower(esize, operand1, operand2);
+ }
+ return Q ? v.ir.VectorPairedMaxUnsigned(esize, operand1, operand2) : v.ir.VectorPairedMaxUnsignedLower(esize, operand1, operand2);
+
+ case MinMaxOperation::Min:
+ if (sign == Signedness::Signed) {
+ return Q ? v.ir.VectorPairedMinSigned(esize, operand1, operand2) : v.ir.VectorPairedMinSignedLower(esize, operand1, operand2);
+ }
+ return Q ? v.ir.VectorPairedMinUnsigned(esize, operand1, operand2) : v.ir.VectorPairedMinUnsignedLower(esize, operand1, operand2);
+
+ default:
+ UNREACHABLE();
+ }
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FPPairedMinMax(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, IR::U32U64 (IREmitter::*fn)(const IR::U32U64&, const IR::U32U64&)) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+ const size_t elements = datasize / esize;
+ const size_t boundary = elements / 2;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ IR::U128 result = v.ir.ZeroVector();
+
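+    // Pairwise reduction: adjacent pairs from Vn fill the lower half of the result,
+    // and adjacent pairs from Vm fill the upper half.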
+ const auto operation = [&](IR::U128 operand, size_t result_start_index) {
+ for (size_t i = 0; i < elements; i += 2, result_start_index++) {
+ const IR::UAny elem1 = v.ir.VectorGetElement(esize, operand, i);
+ const IR::UAny elem2 = v.ir.VectorGetElement(esize, operand, i + 1);
+ const IR::UAny result_elem = (v.ir.*fn)(elem1, elem2);
+
+ result = v.ir.VectorSetElement(esize, result, result_start_index, result_elem);
+ }
+ };
+
+ operation(operand1, 0);
+ operation(operand2, boundary);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool SaturatingArithmeticOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Operation op, Signedness sign) {
+ if (size == 0b11 && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+
+ const IR::U128 result = [&] {
+ if (sign == Signedness::Signed) {
+ if (op == Operation::Add) {
+ return v.ir.VectorSignedSaturatedAdd(esize, operand1, operand2);
+ }
+
+ return v.ir.VectorSignedSaturatedSub(esize, operand1, operand2);
+ }
+
+ if (op == Operation::Add) {
+ return v.ir.VectorUnsignedSaturatedAdd(esize, operand1, operand2);
+ }
+
+ return v.ir.VectorUnsignedSaturatedSub(esize, operand1, operand2);
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool SaturatingShiftLeft(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) {
+ if (size == 0b11 && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ const IR::U128 result = [&] {
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorSignedSaturatedShiftLeft(esize, operand1, operand2);
+ }
+
+ return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand1, operand2);
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+bool TranslatorVisitor::CMGT_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorGreaterSigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMGE_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ IR::U128 result = ir.VectorGreaterEqualSigned(esize, operand1, operand2);
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SignedAbsoluteDifference(*this, Q, size, Vm, Vn, Vd, AbsDiffExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::SABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SignedAbsoluteDifference(*this, Q, size, Vm, Vn, Vd, AbsDiffExtraBehavior::None);
+}
+
+bool TranslatorVisitor::SMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQRDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADD_vector(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+ const auto result = ir.VectorAdd(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MLA_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ const IR::U128 result = ir.VectorAdd(esize, ir.VectorMultiply(esize, operand1, operand2), operand3);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MUL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorMultiply(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, ExtraBehavior::None);
+}
+
+bool TranslatorVisitor::RADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, ExtraBehavior::Round);
+}
+
+bool TranslatorVisitor::SUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, ExtraBehavior::None);
+}
+
+bool TranslatorVisitor::RSUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, ExtraBehavior::Round);
+}
+
+bool TranslatorVisitor::SHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorHalvingAddSigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorHalvingSubSigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingHalvingAdd(*this, Q, size, Vm, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::UHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorHalvingAddUnsigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorHalvingSubUnsigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::URHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingHalvingAdd(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::ADDP_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = Q ? ir.VectorPairedAdd(esize, operand1, operand2) : ir.VectorPairedAddLower(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FABD_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorAbs(esize, ir.FPVectorSub(esize, operand1, operand2));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FACGE_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::AbsoluteGE);
+}
+
+bool TranslatorVisitor::FACGT_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::AbsoluteGT);
+}
+
+bool TranslatorVisitor::FADD_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorAdd(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMLA_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMLS_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 lhs = V(datasize, Vn);
+ const IR::U128 rhs = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorEqual(16, lhs, rhs);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::EQ);
+}
+
+bool TranslatorVisitor::FCMGE_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::GE);
+}
+
+bool TranslatorVisitor::FCMGT_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::GT);
+}
+
+bool TranslatorVisitor::AND_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+ const auto result = ir.VectorAnd(operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BIC_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+
+ IR::U128 result = ir.VectorAndNot(operand1, operand2);
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMHI_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorGreaterUnsigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMHS_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ IR::U128 result = ir.VectorGreaterEqualUnsigned(esize, operand1, operand2);
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMTST_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
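+    // Set a lane to all-ones when (Vn & Vm) is nonzero in that lane, and to zero otherwise.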
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 anded = ir.VectorAnd(operand1, operand2);
+ const IR::U128 result = ir.VectorNot(ir.VectorEqual(esize, anded, ir.ZeroVector()));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorArithmeticVShift(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return SaturatingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::URSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return RoundingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::USHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorLogicalVShift(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 initial_dest = V(datasize, Vd);
+
+ const IR::U128 result = ir.VectorAdd(esize, initial_dest,
+ ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::UMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::FSUB_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorSub(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPS_3(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorRecipStepFused(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorRecipStepFused(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTS_3(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t esize = 16;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorRSqrtStepFused(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorRSqrtStepFused(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ORR_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+ const auto result = ir.VectorOr(operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::ORN_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+
+ auto result = ir.VectorOr(operand1, ir.VectorNot(operand2));
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::PMUL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size != 0b00) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.VectorPolynomialMultiply(operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+ const auto result = ir.VectorSub(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMEQ_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+
+ IR::U128 result = ir.VectorEqual(esize, operand1, operand2);
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::MLS_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+
+ const IR::U128 result = ir.VectorSub(esize, operand3, ir.VectorMultiply(esize, operand1, operand2));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::EOR_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vn);
+ const auto operand2 = V(datasize, Vm);
+ const auto result = ir.VectorEor(operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMAX_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPMinMaxOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Max);
+}
+
+bool TranslatorVisitor::FMAXNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPMinMaxNumericOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Max);
+}
+
+bool TranslatorVisitor::FMAXNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMaxNumeric);
+}
+
+bool TranslatorVisitor::FMAXP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMax);
+}
+
+bool TranslatorVisitor::FMIN_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPMinMaxOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Min);
+}
+
+bool TranslatorVisitor::FMINNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPMinMaxNumericOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Min);
+}
+
+bool TranslatorVisitor::FMINNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMinNumeric);
+}
+
+bool TranslatorVisitor::FMINP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMin);
+}
+
+bool TranslatorVisitor::FADDP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = Q ? ir.FPVectorPairedAdd(esize, operand1, operand2) : ir.FPVectorPairedAddLower(esize, operand1, operand2);
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMUL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorMul(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMULX_vec_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 result = ir.FPVectorMulX(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FDIV_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ IR::U128 result = ir.FPVectorDiv(esize, operand1, operand2);
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
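+// BIF, BIT and BSL all use the bitwise-insert identity
+//     result = a EOR ((a EOR b) AND mask)
+// which keeps bits of a where the mask is clear and takes bits of b where it is set.
+// BIF uses NOT(Vm) as the mask, BIT uses Vm, and BSL uses the old value of Vd as the
+// mask to select between Vn (mask set) and Vm (mask clear).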
+bool TranslatorVisitor::BIF(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vd);
+ const auto operand4 = V(datasize, Vn);
+ const auto operand3 = ir.VectorNot(V(datasize, Vm));
+ const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BIT(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand1 = V(datasize, Vd);
+ const auto operand4 = V(datasize, Vn);
+ const auto operand3 = V(datasize, Vm);
+ const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::BSL(bool Q, Vec Vm, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const auto operand4 = V(datasize, Vn);
+ const auto operand1 = V(datasize, Vm);
+ const auto operand3 = V(datasize, Vd);
+ const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3));
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp
new file mode 100644
index 0000000000..d18f4b10cb
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp
@@ -0,0 +1,174 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+
+using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&);
+
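+// Shared implementation of SDOT/UDOT (vector form): each 32-bit element of Vd
+// accumulates the dot product of four adjacent 8-bit elements of Vn and Vm, with
+// `extension` selecting sign- or zero-extension of the bytes.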
+bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, ExtensionFunction extension) {
+ if (size != 0b10) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const size_t elements = datasize / esize;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(datasize, Vm);
+ IR::U128 result = v.V(datasize, Vd);
+
+ for (size_t i = 0; i < elements; i++) {
+ IR::U32 res_element = v.ir.Imm32(0);
+
+ for (size_t j = 0; j < 4; j++) {
+ const IR::U32 elem1 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand1, 4 * i + j));
+ const IR::U32 elem2 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand2, 4 * i + j));
+
+ res_element = v.ir.Add(res_element, v.ir.Mul(elem1, elem2));
+ }
+
+ res_element = v.ir.Add(v.ir.VectorGetElement(32, result, i), res_element);
+ result = v.ir.VectorSetElement(32, result, i, res_element);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+bool TranslatorVisitor::SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return DotProduct(*this, Q, size, Vm, Vn, Vd, &IREmitter::SignExtendToWord);
+}
+
+bool TranslatorVisitor::UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
+ return DotProduct(*this, Q, size, Vm, Vn, Vd, &IREmitter::ZeroExtendToWord);
+}
+
+bool TranslatorVisitor::FCMLA_vec(bool Q, Imm<2> size, Vec Vm, Imm<2> rot, Vec Vn, Vec Vd) {
+    if (size == 0b00) {
+ return ReservedValue();
+ }
+
+ if (!Q && size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8U << size.ZeroExtend();
+
+ // TODO: Currently we don't support half-precision floating point
+ if (esize == 16) {
+ return InterpretThisInstruction();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t num_elements = datasize / esize;
+ const size_t num_iterations = num_elements / 2;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ IR::U128 result = ir.ZeroVector();
+
+ IR::U32U64 element1;
+ IR::U32U64 element2;
+ IR::U32U64 element3;
+ IR::U32U64 element4;
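+    // Adjacent elements form complex numbers (even index = real part, odd index =
+    // imaginary part). `rot` selects which elements of operand2 feed the two fused
+    // multiply-adds, and with which sign, so that FCMLAs with rotations 0/90 (or
+    // 180/270) can be paired to form a full complex multiply-accumulate.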
+ for (size_t e = 0; e < num_iterations; ++e) {
+ const size_t first = e * 2;
+ const size_t second = first + 1;
+
+ switch (rot.ZeroExtend()) {
+ case 0b00: // 0 degrees
+ element1 = ir.VectorGetElement(esize, operand2, first);
+ element2 = ir.VectorGetElement(esize, operand1, first);
+ element3 = ir.VectorGetElement(esize, operand2, second);
+ element4 = ir.VectorGetElement(esize, operand1, first);
+ break;
+ case 0b01: // 90 degrees
+ element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second));
+ element2 = ir.VectorGetElement(esize, operand1, second);
+ element3 = ir.VectorGetElement(esize, operand2, first);
+ element4 = ir.VectorGetElement(esize, operand1, second);
+ break;
+ case 0b10: // 180 degrees
+ element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first));
+ element2 = ir.VectorGetElement(esize, operand1, first);
+ element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second));
+ element4 = ir.VectorGetElement(esize, operand1, first);
+ break;
+ case 0b11: // 270 degrees
+ element1 = ir.VectorGetElement(esize, operand2, second);
+ element2 = ir.VectorGetElement(esize, operand1, second);
+ element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first));
+ element4 = ir.VectorGetElement(esize, operand1, second);
+ break;
+ }
+
+ const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first);
+ const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second);
+
+ result = ir.VectorSetElement(esize, result, first, ir.FPMulAdd(operand3_elem1, element2, element1));
+ result = ir.VectorSetElement(esize, result, second, ir.FPMulAdd(operand3_elem2, element4, element3));
+ }
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCADD_vec(bool Q, Imm<2> size, Vec Vm, Imm<1> rot, Vec Vn, Vec Vd) {
+    if (size == 0b00) {
+ return ReservedValue();
+ }
+
+ if (!Q && size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8U << size.ZeroExtend();
+
+ // TODO: Currently we don't support half-precision floating point
+ if (esize == 16) {
+ return InterpretThisInstruction();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t num_elements = datasize / esize;
+ const size_t num_iterations = num_elements / 2;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ IR::U128 result = ir.ZeroVector();
+
+ IR::U32U64 element1;
+ IR::U32U64 element3;
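+    // Adjacent elements form complex numbers. FCADD adds operand2 rotated by 90
+    // degrees (rot == 0) or 270 degrees (rot == 1) to operand1: the two halves of
+    // each pair are swapped, and the half landing in the real lane (rot == 0) or the
+    // imaginary lane (rot == 1) is negated before the element-wise add.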
+ for (size_t e = 0; e < num_iterations; ++e) {
+ const size_t first = e * 2;
+ const size_t second = first + 1;
+
+ if (rot == 0) {
+ element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second));
+ element3 = ir.VectorGetElement(esize, operand2, first);
+ } else if (rot == 1) {
+ element1 = ir.VectorGetElement(esize, operand2, second);
+ element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first));
+ }
+
+ const IR::U32U64 operand1_elem1 = ir.VectorGetElement(esize, operand1, first);
+ const IR::U32U64 operand1_elem3 = ir.VectorGetElement(esize, operand1, second);
+
+ result = ir.VectorSetElement(esize, result, first, ir.FPAdd(operand1_elem1, element1));
+ result = ir.VectorSetElement(esize, result, second, ir.FPAdd(operand1_elem3, element3));
+ }
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp
new file mode 100644
index 0000000000..ca3e3b9591
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp
@@ -0,0 +1,848 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/fp/rounding_mode.h"
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
+enum class ComparisonType {
+ EQ,
+ GE,
+ GT,
+ LE,
+ LT,
+};
+
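+// Shared implementations of the integer and floating-point compare-against-zero
+// families: each element of Vn is compared with zero, producing an all-ones or
+// all-zeros element. The floating-point LE/LT cases are expressed by swapping the
+// operands of the GE/GT comparisons.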
+bool CompareAgainstZero(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, ComparisonType type) {
+ if (size == 0b11 && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 zero = v.ir.ZeroVector();
+ IR::U128 result = [&] {
+ switch (type) {
+ case ComparisonType::EQ:
+ return v.ir.VectorEqual(esize, operand, zero);
+ case ComparisonType::GE:
+ return v.ir.VectorGreaterEqualSigned(esize, operand, zero);
+ case ComparisonType::GT:
+ return v.ir.VectorGreaterSigned(esize, operand, zero);
+ case ComparisonType::LE:
+ return v.ir.VectorLessEqualSigned(esize, operand, zero);
+ case ComparisonType::LT:
+ default:
+ return v.ir.VectorLessSigned(esize, operand, zero);
+ }
+ }();
+
+ if (datasize == 64) {
+ result = v.ir.VectorZeroUpper(result);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FPCompareAgainstZero(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, ComparisonType type) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 zero = v.ir.ZeroVector();
+ const IR::U128 result = [&] {
+ switch (type) {
+ case ComparisonType::EQ:
+ return v.ir.FPVectorEqual(esize, operand, zero);
+ case ComparisonType::GE:
+ return v.ir.FPVectorGreaterEqual(esize, operand, zero);
+ case ComparisonType::GT:
+ return v.ir.FPVectorGreater(esize, operand, zero);
+ case ComparisonType::LE:
+ return v.ir.FPVectorGreaterEqual(esize, zero, operand);
+ case ComparisonType::LT:
+ return v.ir.FPVectorGreater(esize, zero, operand);
+ }
+
+ UNREACHABLE();
+ }();
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
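+// Shared helpers for SCVTF/UCVTF and the FCVT{N,M,A,P,Z}{S,U} families. Conversions
+// use zero fractional bits; integer-to-float takes its rounding mode from FPCR,
+// while float-to-integer uses the rounding mode implied by the instruction.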
+bool IntegerConvertToFloat(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, Signedness signedness) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+ const FP::RoundingMode rounding_mode = v.ir.current_location->FPCR().RMode();
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 result = signedness == Signedness::Signed
+ ? v.ir.FPVectorFromSignedFixed(esize, operand, 0, rounding_mode)
+ : v.ir.FPVectorFromUnsignedFixed(esize, operand, 0, rounding_mode);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FloatConvertToInteger(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, Signedness signedness, FP::RoundingMode rounding_mode) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 result = signedness == Signedness::Signed
+ ? v.ir.FPVectorToSignedFixed(esize, operand, 0, rounding_mode)
+ : v.ir.FPVectorToUnsignedFixed(esize, operand, 0, rounding_mode);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FloatRoundToIntegral(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) {
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 result = v.ir.FPVectorRoundInt(esize, operand, rounding_mode, exact);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FloatRoundToIntegralHalfPrecision(TranslatorVisitor& v, bool Q, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ const IR::U128 result = v.ir.FPVectorRoundInt(esize, operand, rounding_mode, exact);
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
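+// Shared implementation of SQXTN/SQXTUN/UQXTN: the 2*esize elements of the 128-bit
+// source are narrowed to esize with the saturation behaviour supplied by `fn`, and
+// the result is written to the lower (Q == 0) or upper (Q == 1) half of Vd.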
+bool SaturatedNarrow(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, IR::U128 (IR::IREmitter::*fn)(size_t, const IR::U128&)) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = 64;
+ const size_t part = Q ? 1 : 0;
+
+ const IR::U128 operand = v.V(2 * datasize, Vn);
+ const IR::U128 result = (v.ir.*fn)(2 * esize, operand);
+
+ v.Vpart(datasize, Vd, part, result);
+ return true;
+}
+
+enum class PairedAddLongExtraBehavior {
+ None,
+ Accumulate,
+};
+
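+// Shared implementation of SADDLP/UADDLP and SADALP/UADALP: adjacent element pairs
+// are added together, widening to 2*esize, optionally accumulating into the
+// existing contents of Vd.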
+bool PairedAddLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, Signedness sign, PairedAddLongExtraBehavior behavior) {
+ if (size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = v.V(datasize, Vn);
+ IR::U128 result = [&] {
+ if (sign == Signedness::Signed) {
+ return v.ir.VectorPairedAddSignedWiden(esize, operand);
+ }
+
+ return v.ir.VectorPairedAddUnsignedWiden(esize, operand);
+ }();
+
+ if (behavior == PairedAddLongExtraBehavior::Accumulate) {
+ result = v.ir.VectorAdd(esize * 2, v.V(datasize, Vd), result);
+ }
+
+ if (datasize == 64) {
+ result = v.ir.VectorZeroUpper(result);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+} // Anonymous namespace
+
+bool TranslatorVisitor::CLS_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
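+    // Count-leading-sign-bits trick: arithmetically shifting each element right by
+    // its full width fills it with copies of its sign bit, so EORing with the
+    // original turns leading sign bits into leading zeros. CLS is then CLZ of that
+    // value minus one, the subtraction discounting the sign bit itself.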
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 shifted = ir.VectorArithmeticShiftRight(esize, operand, static_cast<u8>(esize));
+ const IR::U128 xored = ir.VectorEor(operand, shifted);
+ const IR::U128 clz = ir.VectorCountLeadingZeros(esize, xored);
+ IR::U128 result = ir.VectorSub(esize, clz, ir.VectorBroadcast(esize, I(esize, 1)));
+
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CLZ_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ IR::U128 result = ir.VectorCountLeadingZeros(esize, operand);
+
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CNT(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size != 0b00) {
+ return ReservedValue();
+ }
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorPopulationCount(operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::GE);
+}
+
+bool TranslatorVisitor::CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::GT);
+}
+
+bool TranslatorVisitor::CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::EQ);
+}
+
+bool TranslatorVisitor::CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::LE);
+}
+
+bool TranslatorVisitor::CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::LT);
+}
+
+bool TranslatorVisitor::ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (!Q && size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 data = V(datasize, Vn);
+ const IR::U128 result = ir.VectorAbs(esize, data);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::XTN(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = 64;
+ const size_t part = Q ? 1 : 0;
+
+ const IR::U128 operand = V(2 * datasize, Vn);
+ const IR::U128 result = ir.VectorNarrow(2 * esize, operand);
+
+ Vpart(datasize, Vd, part, result);
+ return true;
+}
+
+bool TranslatorVisitor::FABS_1(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorAbs(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FABS_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorAbs(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_zero_3(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 zero = ir.ZeroVector();
+ const IR::U128 result = ir.FPVectorEqual(16, operand, zero);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCMEQ_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::EQ);
+}
+
+bool TranslatorVisitor::FCMGE_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::GE);
+}
+
+bool TranslatorVisitor::FCMGT_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::GT);
+}
+
+bool TranslatorVisitor::FCMLE_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::LE);
+}
+
+bool TranslatorVisitor::FCMLT_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::LT);
+}
+
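+// FCVTL widens the half of Vn selected by Q (half -> single or single -> double)
+// into the full destination register; FCVTN below performs the inverse narrowing
+// and writes its result to the lower (FCVTN) or upper (FCVTN2) half of Vd.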
+bool TranslatorVisitor::FCVTL(bool Q, bool sz, Vec Vn, Vec Vd) {
+ const size_t esize = sz ? 32 : 16;
+ const size_t datasize = 64;
+ const size_t num_elements = datasize / esize;
+
+ const IR::U128 part = Vpart(64, Vn, Q);
+ const auto rounding_mode = ir.current_location->FPCR().RMode();
+ IR::U128 result = ir.ZeroVector();
+
+ for (size_t i = 0; i < num_elements; i++) {
+ IR::U16U32U64 element = ir.VectorGetElement(esize, part, i);
+
+ if (esize == 16) {
+ element = ir.FPHalfToSingle(element, rounding_mode);
+ } else if (esize == 32) {
+ element = ir.FPSingleToDouble(element, rounding_mode);
+ }
+
+ result = ir.VectorSetElement(2 * esize, result, i, element);
+ }
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTN(bool Q, bool sz, Vec Vn, Vec Vd) {
+ const size_t datasize = 64;
+ const size_t esize = sz ? 32 : 16;
+ const size_t num_elements = datasize / esize;
+
+ const IR::U128 operand = V(128, Vn);
+ const auto rounding_mode = ir.current_location->FPCR().RMode();
+ IR::U128 result = ir.ZeroVector();
+
+ for (size_t i = 0; i < num_elements; i++) {
+ IR::U16U32U64 element = ir.VectorGetElement(2 * esize, operand, i);
+
+ if (esize == 16) {
+ element = ir.FPSingleToHalf(element, rounding_mode);
+ } else if (esize == 32) {
+ element = ir.FPDoubleToSingle(element, rounding_mode);
+ }
+
+ result = ir.VectorSetElement(esize, result, i, element);
+ }
+
+ Vpart(datasize, Vd, Q, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTNS_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::ToNearest_TieEven);
+}
+
+bool TranslatorVisitor::FCVTMS_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsMinusInfinity);
+}
+
+bool TranslatorVisitor::FCVTAS_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+
+bool TranslatorVisitor::FCVTPS_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsPlusInfinity);
+}
+
+bool TranslatorVisitor::FCVTXN_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (!sz) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t part = Q ? 1 : 0;
+ const auto operand = ir.GetQ(Vn);
+ auto result = ir.ZeroVector();
+
+ for (size_t e = 0; e < 2; ++e) {
+ const IR::U64 element = ir.VectorGetElement(64, operand, e);
+ const IR::U32 converted = ir.FPDoubleToSingle(element, FP::RoundingMode::ToOdd);
+
+ result = ir.VectorSetElement(32, result, e, converted);
+ }
+
+ Vpart(64, Vd, part, result);
+ return true;
+}
+
+bool TranslatorVisitor::FCVTZS_int_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FCVTNU_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::ToNearest_TieEven);
+}
+
+bool TranslatorVisitor::FCVTMU_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsMinusInfinity);
+}
+
+bool TranslatorVisitor::FCVTAU_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::ToNearest_TieAwayFromZero);
+}
+
+bool TranslatorVisitor::FCVTPU_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsPlusInfinity);
+}
+
+bool TranslatorVisitor::FCVTZU_int_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsZero);
+}
+
+bool TranslatorVisitor::FRINTN_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false);
+}
+
+bool TranslatorVisitor::FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false);
+}
+
+bool TranslatorVisitor::FRINTM_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTP_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false);
+}
+
+bool TranslatorVisitor::FRINTZ_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsZero, false);
+}
+
+bool TranslatorVisitor::FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsZero, false);
+}
+
+bool TranslatorVisitor::FRINTA_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false);
+}
+
+bool TranslatorVisitor::FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false);
+}
+
+bool TranslatorVisitor::FRINTX_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, ir.current_location->FPCR().RMode(), true);
+}
+
+bool TranslatorVisitor::FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, ir.current_location->FPCR().RMode(), true);
+}
+
+bool TranslatorVisitor::FRINTI_1(bool Q, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, ir.current_location->FPCR().RMode(), false);
+}
+
+bool TranslatorVisitor::FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, ir.current_location->FPCR().RMode(), false);
+}
+
+bool TranslatorVisitor::FRECPE_3(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorRecipEstimate(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRECPE_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorRecipEstimate(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorSqrt(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTE_3(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 16;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorRSqrtEstimate(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FRSQRTE_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.FPVectorRSqrtEstimate(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
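+// FNEG is implemented by EORing each element with a mask that has only the per-lane
+// sign bit set, flipping the sign without touching the exponent or mantissa.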
+bool TranslatorVisitor::FNEG_1(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 mask = ir.VectorBroadcast(64, I(64, 0x8000800080008000));
+ const IR::U128 result = ir.VectorEor(operand, mask);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = sz ? 64 : 32;
+ const size_t mask_value = esize == 64 ? 0x8000000000000000 : 0x8000000080000000;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 mask = Q ? ir.VectorBroadcast(esize, I(esize, mask_value)) : ir.VectorBroadcastLower(esize, I(esize, mask_value));
+ const IR::U128 result = ir.VectorEor(operand, mask);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+ const size_t esize = 8 << size.ZeroExtend<size_t>();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 zero = ir.ZeroVector();
+ const IR::U128 result = ir.VectorSub(esize, zero, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorSignedSaturatedNarrowToUnsigned);
+}
+
+bool TranslatorVisitor::SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorSignedSaturatedNarrowToSigned);
+}
+
+bool TranslatorVisitor::UQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorUnsignedSaturatedNarrow);
+}
+
+bool TranslatorVisitor::NOT(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ IR::U128 result = ir.VectorNot(operand);
+
+ if (datasize == 64) {
+ result = ir.VectorZeroUpper(result);
+ }
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::RBIT_asimd(bool Q, Vec Vn, Vec Vd) {
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 data = V(datasize, Vn);
+ const IR::U128 result = ir.VectorReverseBits(data);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::REV16_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size > 0) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ constexpr size_t esize = 8;
+
+ const IR::U128 data = V(datasize, Vn);
+ const IR::U128 result = ir.VectorReverseElementsInHalfGroups(esize, data);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::REV32_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size > 1) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 data = V(datasize, Vn);
+ const IR::U128 result = ir.VectorReverseElementsInWordGroups(esize, data);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::REV64_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size > 2) {
+ return UnallocatedEncoding();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 data = V(datasize, Vn);
+ const IR::U128 result = ir.VectorReverseElementsInLongGroups(esize, data);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorSignedSaturatedAbs(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorSignedSaturatedNeg(esize, operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vd);
+ const IR::U128 result = ir.VectorSignedSaturatedAccumulateUnsigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11 && !Q) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vd);
+ const IR::U128 result = ir.VectorUnsignedSaturatedAccumulateSigned(esize, operand1, operand2);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Signed, PairedAddLongExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::SADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Signed, PairedAddLongExtraBehavior::None);
+}
+
+bool TranslatorVisitor::UADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Unsigned, PairedAddLongExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::UADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Unsigned, PairedAddLongExtraBehavior::None);
+}
+
+bool TranslatorVisitor::URECPE(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorUnsignedRecipEstimate(operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd) {
+ if (sz) {
+ return ReservedValue();
+ }
+
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand = V(datasize, Vn);
+ const IR::U128 result = ir.VectorUnsignedRecipSqrtEstimate(operand);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return IntegerConvertToFloat(*this, Q, sz, Vn, Vd, Signedness::Signed);
+}
+
+bool TranslatorVisitor::UCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd) {
+ return IntegerConvertToFloat(*this, Q, sz, Vn, Vd, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::SHLL(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
+ if (size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8 << size.ZeroExtend();
+
+ const IR::U128 operand = ir.VectorZeroExtend(esize, Vpart(64, Vn, Q));
+ const IR::U128 result = ir.VectorLogicalShiftLeft(esize * 2, operand, static_cast<u8>(esize));
+
+ V(128, Vd, result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
new file mode 100644
index 0000000000..07234fc61b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
@@ -0,0 +1,408 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <utility>
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+namespace {
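+// Combines the index and register fields of the by-element encodings. For 16-bit
+// elements (size == 0b01) the index is H:L:M and only Vmlo names the register,
+// restricting Vm to V0-V15; otherwise the index is H:L and M supplies the top bit
+// of the register number.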
+std::pair<size_t, Vec> Combine(Imm<2> size, Imm<1> H, Imm<1> L, Imm<1> M, Imm<4> Vmlo) {
+ if (size == 0b01) {
+ return {concatenate(H, L, M).ZeroExtend(), Vmlo.ZeroExtend<Vec>()};
+ }
+
+ return {concatenate(H, L).ZeroExtend(), concatenate(M, Vmlo).ZeroExtend<Vec>()};
+}
+
+enum class ExtraBehavior {
+ None,
+ Extended,
+ Accumulate,
+ Subtract,
+};
+
+bool MultiplyByElement(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
+ if (size != 0b01 && size != 0b10) {
+ return v.ReservedValue();
+ }
+
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+ const size_t idxdsize = H == 1 ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index);
+ const IR::U128 operand3 = v.V(datasize, Vd);
+
+ IR::U128 result = v.ir.VectorMultiply(esize, operand1, operand2);
+ if (extra_behavior == ExtraBehavior::Accumulate) {
+ result = v.ir.VectorAdd(esize, operand3, result);
+ } else if (extra_behavior == ExtraBehavior::Subtract) {
+ result = v.ir.VectorSub(esize, operand3, result);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
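+// Shared implementation of FMUL/FMULX/FMLA/FMLS by element: the selected element of
+// Vm is broadcast across a vector and multiplied element-wise with Vn; accumulation
+// and subtraction are folded into a fused multiply-add (with operand1 negated for
+// the subtract case).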
+bool FPMultiplyByElement(TranslatorVisitor& v, bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
+ if (sz && L == 1) {
+ return v.ReservedValue();
+ }
+ if (sz && !Q) {
+ return v.ReservedValue();
+ }
+
+ const size_t idxdsize = H == 1 ? 128 : 64;
+ const size_t index = sz ? H.ZeroExtend() : concatenate(H, L).ZeroExtend();
+ const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
+ const size_t esize = sz ? 64 : 32;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = Q ? v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index);
+ const IR::U128 operand3 = v.V(datasize, Vd);
+
+ const IR::U128 result = [&] {
+ switch (extra_behavior) {
+ case ExtraBehavior::None:
+ return v.ir.FPVectorMul(esize, operand1, operand2);
+ case ExtraBehavior::Extended:
+ return v.ir.FPVectorMulX(esize, operand1, operand2);
+ case ExtraBehavior::Accumulate:
+ return v.ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
+ case ExtraBehavior::Subtract:
+ return v.ir.FPVectorMulAdd(esize, operand3, v.ir.FPVectorNeg(esize, operand1), operand2);
+ }
+ UNREACHABLE();
+ }();
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+bool FPMultiplyByElementHalfPrecision(TranslatorVisitor& v, bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) {
+ const size_t idxdsize = H == 1 ? 128 : 64;
+ const size_t index = concatenate(H, L, M).ZeroExtend();
+ const Vec Vm = Vmlo.ZeroExtend<Vec>();
+ const size_t esize = 16;
+ const size_t datasize = Q ? 128 : 64;
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = Q ? v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index);
+ const IR::U128 operand3 = v.V(datasize, Vd);
+
+ // TODO: We currently don't implement half-precision paths for
+ // regular multiplies and extended multiplies.
+ const IR::U128 result = [&] {
+ switch (extra_behavior) {
+ case ExtraBehavior::None:
+ break;
+ case ExtraBehavior::Extended:
+ break;
+ case ExtraBehavior::Accumulate:
+ return v.ir.FPVectorMulAdd(esize, operand3, operand1, operand2);
+ case ExtraBehavior::Subtract:
+ return v.ir.FPVectorMulAdd(esize, operand3, v.ir.FPVectorNeg(esize, operand1), operand2);
+ }
+ UNREACHABLE();
+ }();
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&);
+
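+// SDOT/UDOT (by element): as in the vector form, but every 32-bit destination
+// element accumulates the dot product of four bytes of Vn with the single four-byte
+// group of Vm selected by `index`.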
+bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtensionFunction extension) {
+ if (size != 0b10) {
+ return v.ReservedValue();
+ }
+
+ const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const size_t elements = datasize / esize;
+ const size_t index = concatenate(H, L).ZeroExtend();
+
+ const IR::U128 operand1 = v.V(datasize, Vn);
+ const IR::U128 operand2 = v.V(128, Vm);
+ IR::U128 result = v.V(datasize, Vd);
+
+ for (size_t i = 0; i < elements; i++) {
+ IR::U32 res_element = v.ir.Imm32(0);
+
+ for (size_t j = 0; j < 4; j++) {
+ const IR::U32 elem1 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand1, 4 * i + j));
+ const IR::U32 elem2 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand2, 4 * index + j));
+
+ res_element = v.ir.Add(res_element, v.ir.Mul(elem1, elem2));
+ }
+
+ res_element = v.ir.Add(v.ir.VectorGetElement(32, result, i), res_element);
+ result = v.ir.VectorSetElement(32, result, i, res_element);
+ }
+
+ v.V(datasize, Vd, result);
+ return true;
+}
+
+enum class Signedness {
+ Signed,
+ Unsigned
+};
+
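+// Shared implementation of {S,U}MULL/{S,U}MLAL/{S,U}MLSL by element: the half of Vn
+// selected by Q is multiplied, widening to 2*esize, by the broadcast element of Vm,
+// optionally accumulating into or subtracting from the double-width Vd.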
+bool MultiplyLong(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior, Signedness sign) {
+ if (size == 0b00 || size == 0b11) {
+ return v.ReservedValue();
+ }
+
+ const size_t idxsize = H == 1 ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = v.Vpart(datasize, Vn, Q);
+ const IR::U128 operand2 = v.V(idxsize, Vm);
+ const IR::U128 index_vector = v.ir.VectorBroadcastElement(esize, operand2, index);
+
+ const IR::U128 result = [&] {
+ const IR::U128 product = sign == Signedness::Signed
+ ? v.ir.VectorMultiplySignedWiden(esize, operand1, index_vector)
+ : v.ir.VectorMultiplyUnsignedWiden(esize, operand1, index_vector);
+
+ if (extra_behavior == ExtraBehavior::None) {
+ return product;
+ }
+
+ const IR::U128 operand3 = v.V(2 * datasize, Vd);
+ if (extra_behavior == ExtraBehavior::Accumulate) {
+ return v.ir.VectorAdd(2 * esize, operand3, product);
+ }
+
+ return v.ir.VectorSub(2 * esize, operand3, product);
+ }();
+
+ v.V(2 * datasize, Vd, result);
+ return true;
+}
+} // Anonymous namespace
+
+bool TranslatorVisitor::MLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::MLS_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
+}
+
+bool TranslatorVisitor::MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
+}
+
+bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ if (size == 0b01 && H == 1 && Q == 0) {
+ return ReservedValue();
+ }
+
+ if (size == 0b10 && (L == 1 || Q == 0)) {
+ return ReservedValue();
+ }
+
+ const size_t esize = 8U << size.ZeroExtend();
+
+ // TODO: We don't support the half-precision floating point variant yet.
+ if (esize == 16) {
+ return InterpretThisInstruction();
+ }
+
+ const size_t index = [=] {
+ if (size == 0b01) {
+ return concatenate(H, L).ZeroExtend();
+ }
+ return H.ZeroExtend();
+ }();
+
+ const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>();
+
+ const size_t datasize = Q ? 128 : 64;
+ const size_t num_elements = datasize / esize;
+ const size_t num_iterations = num_elements / 2;
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(datasize, Vm);
+ const IR::U128 operand3 = V(datasize, Vd);
+ IR::U128 result = ir.ZeroVector();
+
+ IR::U32U64 element1;
+ IR::U32U64 element2;
+ IR::U32U64 element3;
+ IR::U32U64 element4;
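+    // As in the vector form of FCMLA, adjacent elements form complex numbers; here
+    // the second operand of every multiply is the single element pair of Vm selected
+    // by `index`, reused for every destination pair.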
+ for (size_t e = 0; e < num_iterations; ++e) {
+ const size_t first = e * 2;
+ const size_t second = first + 1;
+
+ const size_t index_first = index * 2;
+ const size_t index_second = index_first + 1;
+
+ switch (rot.ZeroExtend()) {
+ case 0b00: // 0 degrees
+ element1 = ir.VectorGetElement(esize, operand2, index_first);
+ element2 = ir.VectorGetElement(esize, operand1, first);
+ element3 = ir.VectorGetElement(esize, operand2, index_second);
+ element4 = ir.VectorGetElement(esize, operand1, first);
+ break;
+ case 0b01: // 90 degrees
+ element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+ element2 = ir.VectorGetElement(esize, operand1, second);
+ element3 = ir.VectorGetElement(esize, operand2, index_first);
+ element4 = ir.VectorGetElement(esize, operand1, second);
+ break;
+ case 0b10: // 180 degrees
+ element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+ element2 = ir.VectorGetElement(esize, operand1, first);
+ element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+ element4 = ir.VectorGetElement(esize, operand1, first);
+ break;
+ case 0b11: // 270 degrees
+ element1 = ir.VectorGetElement(esize, operand2, index_second);
+ element2 = ir.VectorGetElement(esize, operand1, second);
+ element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+ element4 = ir.VectorGetElement(esize, operand1, second);
+ break;
+ }
+
+ const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first);
+ const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second);
+
+ result = ir.VectorSetElement(esize, result, first, ir.FPMulAdd(operand3_elem1, element2, element1));
+ result = ir.VectorSetElement(esize, result, second, ir.FPMulAdd(operand3_elem2, element4, element3));
+ }
+
+ ir.SetQ(Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::FMLA_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
+}
+
+bool TranslatorVisitor::FMLS_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
+}
+
+bool TranslatorVisitor::FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract);
+}
+
+bool TranslatorVisitor::FMUL_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
+}
+
+bool TranslatorVisitor::FMULX_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Extended);
+}
+
+bool TranslatorVisitor::SMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None, Signedness::Signed);
+}
+
+bool TranslatorVisitor::SQDMULL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t part = Q ? 1 : 0;
+ const size_t idxsize = H == 1 ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = 64;
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = Vpart(datasize, Vn, part);
+ const IR::U128 operand2 = V(idxsize, Vm);
+ const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, index_vector);
+
+ V(128, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t idxsize = H == 1 ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(idxsize, Vm);
+ const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, operand1, index_vector);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SQRDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ if (size == 0b00 || size == 0b11) {
+ return ReservedValue();
+ }
+
+ const size_t idxsize = H == 1 ? 128 : 64;
+ const size_t esize = 8 << size.ZeroExtend();
+ const size_t datasize = Q ? 128 : 64;
+ const auto [index, Vm] = Combine(size, H, L, M, Vmlo);
+
+ const IR::U128 operand1 = V(datasize, Vn);
+ const IR::U128 operand2 = V(idxsize, Vm);
+ const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
+ const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, index_vector);
+
+ V(datasize, Vd, result);
+ return true;
+}
+
+bool TranslatorVisitor::SDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return DotProduct(*this, Q, size, L, M, Vmlo, H, Vn, Vd, &IREmitter::SignExtendToWord);
+}
+
+bool TranslatorVisitor::UDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return DotProduct(*this, Q, size, L, M, Vmlo, H, Vn, Vd, &IREmitter::ZeroExtendToWord);
+}
+
+bool TranslatorVisitor::UMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract, Signedness::Unsigned);
+}
+
+bool TranslatorVisitor::UMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
+ return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None, Signedness::Unsigned);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp
new file mode 100644
index 0000000000..aea3b5baf6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp
@@ -0,0 +1,51 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
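+// Data cache maintenance instructions forward the operation to the user's callback,
+// passing the virtual address (or Set/Way value) held in Rt; translation continues.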
+static bool DataCacheInstruction(TranslatorVisitor& v, DataCacheOperation op, const Reg Rt) {
+ v.ir.DataCacheOperationRaised(op, v.X(64, Rt));
+ return true;
+}
+
+bool TranslatorVisitor::DC_IVAC(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::InvalidateByVAToPoC, Rt);
+}
+
+bool TranslatorVisitor::DC_ISW(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::InvalidateBySetWay, Rt);
+}
+
+bool TranslatorVisitor::DC_CSW(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanBySetWay, Rt);
+}
+
+bool TranslatorVisitor::DC_CISW(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanAndInvalidateBySetWay, Rt);
+}
+
+bool TranslatorVisitor::DC_ZVA(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::ZeroByVA, Rt);
+}
+
+bool TranslatorVisitor::DC_CVAC(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoC, Rt);
+}
+
+bool TranslatorVisitor::DC_CVAU(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoU, Rt);
+}
+
+bool TranslatorVisitor::DC_CVAP(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoP, Rt);
+}
+
+bool TranslatorVisitor::DC_CIVAC(Reg Rt) {
+ return DataCacheInstruction(*this, DataCacheOperation::CleanAndInvalidateByVAToPoC, Rt);
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp
new file mode 100644
index 0000000000..3c91168cab
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp
@@ -0,0 +1,31 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
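+// Instruction cache maintenance is forwarded to the user's callback; the block is
+// then ended (PC advanced past this instruction, return to the dispatcher) so that
+// any invalidation performed by the callback takes effect before more code runs.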
+bool TranslatorVisitor::IC_IALLU() {
+ ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateAllToPoU, ir.Imm64(0));
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}});
+ return false;
+}
+
+bool TranslatorVisitor::IC_IALLUIS() {
+ ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateAllToPoUInnerSharable, ir.Imm64(0));
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}});
+ return false;
+}
+
+bool TranslatorVisitor::IC_IVAU(Reg Rt) {
+ ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateByVAToPoU, X(64, Rt));
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}});
+ return false;
+}
+
+} // namespace Dynarmic::A64
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp
new file mode 100644
index 0000000000..9c35a8f364
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp
@@ -0,0 +1,161 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+// Register encodings used by MRS and MSR.
+// Order of fields: op0, CRn, op1, op2, CRm.
+enum class SystemRegisterEncoding : u32 {
+ // Counter-timer Frequency register
+ CNTFRQ_EL0 = 0b11'1110'011'000'0000,
+ // Counter-timer Physical Count register
+ CNTPCT_EL0 = 0b11'1110'011'001'0000,
+ // Cache Type Register
+ CTR_EL0 = 0b11'0000'011'001'0000,
+ // Data Cache Zero ID register
+ DCZID_EL0 = 0b11'0000'011'111'0000,
+ // Floating-point Control Register
+ FPCR = 0b11'0100'011'000'0100,
+ // Floating-point Status Register
+ FPSR = 0b11'0100'011'001'0100,
+ // NZCV, Condition Flags
+ NZCV = 0b11'0100'011'000'0010,
+ // Read/Write Software Thread ID Register
+ TPIDR_EL0 = 0b11'1101'011'010'0000,
+ // Read-Only Software Thread ID Register
+ TPIDRRO_EL0 = 0b11'1101'011'011'0000,
+};
+
+bool TranslatorVisitor::HINT([[maybe_unused]] Imm<4> CRm, [[maybe_unused]] Imm<3> op2) {
+ return true;
+}
+
+bool TranslatorVisitor::NOP() {
+ return true;
+}
+
+bool TranslatorVisitor::YIELD() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::Yield);
+}
+
+bool TranslatorVisitor::WFE() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::WaitForEvent);
+}
+
+bool TranslatorVisitor::WFI() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::WaitForInterrupt);
+}
+
+bool TranslatorVisitor::SEV() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::SendEvent);
+}
+
+bool TranslatorVisitor::SEVL() {
+ if (!options.hook_hint_instructions) {
+ return true;
+ }
+ return RaiseException(Exception::SendEventLocal);
+}
+
+bool TranslatorVisitor::CLREX(Imm<4> /*CRm*/) {
+ ir.ClearExclusive();
+ return true;
+}
+
+bool TranslatorVisitor::DSB(Imm<4> /*CRm*/) {
+ ir.DataSynchronizationBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::DMB(Imm<4> /*CRm*/) {
+ ir.DataMemoryBarrier();
+ return true;
+}
+
+bool TranslatorVisitor::ISB(Imm<4> /*CRm*/) {
+ ir.InstructionSynchronizationBarrier();
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.SetTerm(IR::Term::ReturnToDispatch{});
+ return false;
+}
+
+bool TranslatorVisitor::MSR_reg(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt) {
+ const auto sys_reg = concatenate(Imm<1>{1}, o0, CRn, op1, op2, CRm).ZeroExtend<SystemRegisterEncoding>();
+ switch (sys_reg) {
+ case SystemRegisterEncoding::FPCR:
+ ir.SetFPCR(X(32, Rt));
+ ir.SetPC(ir.Imm64(ir.current_location->PC() + 4));
+ ir.SetTerm(IR::Term::FastDispatchHint{});
+ return false;
+ case SystemRegisterEncoding::FPSR:
+ ir.SetFPSR(X(32, Rt));
+ return true;
+ case SystemRegisterEncoding::NZCV:
+ ir.SetNZCVRaw(X(32, Rt));
+ return true;
+ case SystemRegisterEncoding::TPIDR_EL0:
+ ir.SetTPIDR(X(64, Rt));
+ return true;
+ default:
+ break;
+ }
+ return InterpretThisInstruction();
+}
+
+bool TranslatorVisitor::MRS(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt) {
+ const auto sys_reg = concatenate(Imm<1>{1}, o0, CRn, op1, op2, CRm).ZeroExtend<SystemRegisterEncoding>();
+ switch (sys_reg) {
+ case SystemRegisterEncoding::CNTFRQ_EL0:
+ X(32, Rt, ir.GetCNTFRQ());
+ return true;
+ case SystemRegisterEncoding::CNTPCT_EL0:
+ // HACK: Ensure that this is the first instruction in the block it's emitted in, so the cycle count is most up-to-date.
+ if (!ir.block.empty() && !options.wall_clock_cntpct) {
+ ir.block.CycleCount()--;
+ ir.SetTerm(IR::Term::LinkBlock{*ir.current_location});
+ return false;
+ }
+ X(64, Rt, ir.GetCNTPCT());
+ return true;
+ case SystemRegisterEncoding::CTR_EL0:
+ X(32, Rt, ir.GetCTR());
+ return true;
+ case SystemRegisterEncoding::DCZID_EL0:
+ X(32, Rt, ir.GetDCZID());
+ return true;
+ case SystemRegisterEncoding::FPCR:
+ X(32, Rt, ir.GetFPCR());
+ return true;
+ case SystemRegisterEncoding::FPSR:
+ X(32, Rt, ir.GetFPSR());
+ return true;
+ case SystemRegisterEncoding::NZCV:
+ X(32, Rt, ir.GetNZCVRaw());
+ return true;
+ case SystemRegisterEncoding::TPIDR_EL0:
+ X(64, Rt, ir.GetTPIDR());
+ return true;
+ case SystemRegisterEncoding::TPIDRRO_EL0:
+ X(64, Rt, ir.GetTPIDRRO());
+ return true;
+ }
+ return InterpretThisInstruction();
+}
+
+} // namespace Dynarmic::A64
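
For orientation, a minimal sketch (not part of the patch) of how the SystemRegisterEncoding values above are assembled: MSR_reg and MRS prepend the constant high bit of op0 and concatenate o0:CRn:op1:op2:CRm, so NZCV, which is accessed as op0=3, op1=3, CRn=4, CRm=2, op2=0, works out as follows.

    #include <cstdint>

    // Bit layout produced by concatenate(Imm<1>{1}, o0, CRn, op1, op2, CRm):
    // bit 15 = 1, bit 14 = o0, bits 13..10 = CRn, bits 9..7 = op1, bits 6..4 = op2, bits 3..0 = CRm.
    constexpr std::uint32_t SysRegEncoding(std::uint32_t o0, std::uint32_t CRn, std::uint32_t op1,
                                           std::uint32_t op2, std::uint32_t CRm) {
        return (1u << 15) | (o0 << 14) | (CRn << 10) | (op1 << 7) | (op2 << 4) | CRm;
    }

    static_assert(SysRegEncoding(1, 4, 3, 0, 2) == 0b11'0100'011'000'0010);  // SystemRegisterEncoding::NZCV
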
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp
new file mode 100644
index 0000000000..cbed3c34ed
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp
@@ -0,0 +1,46 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::AXFlag() {
+ const IR::U32 nzcv = ir.GetNZCVRaw();
+
+ const IR::U32 z = ir.And(nzcv, ir.Imm32(0x40000000));
+ const IR::U32 c = ir.And(nzcv, ir.Imm32(0x20000000));
+ const IR::U32 v = ir.And(nzcv, ir.Imm32(0x10000000));
+
+ const IR::U32 new_z = ir.Or(ir.LogicalShiftLeft(v, ir.Imm8(2)), z);
+ const IR::U32 new_c = ir.And(ir.AndNot(c, ir.LogicalShiftLeft(v, ir.Imm8(1))), ir.Imm32(0x20000000));
+
+ ir.SetNZCVRaw(ir.Or(new_z, new_c));
+ return true;
+}
+
+bool TranslatorVisitor::XAFlag() {
+ const IR::U32 nzcv = ir.GetNZCVRaw();
+
+ const IR::U32 z = ir.And(nzcv, ir.Imm32(0x40000000));
+ const IR::U32 c = ir.And(nzcv, ir.Imm32(0x20000000));
+
+ const IR::U32 not_z = ir.AndNot(ir.Imm32(0x40000000), z);
+ const IR::U32 not_c = ir.AndNot(ir.Imm32(0x20000000), c);
+
+ const IR::U32 new_n = ir.And(ir.LogicalShiftLeft(not_c, ir.Imm8(2)),
+ ir.LogicalShiftLeft(not_z, ir.Imm8(1)));
+ const IR::U32 new_z = ir.And(z, ir.LogicalShiftLeft(c, ir.Imm8(1)));
+ const IR::U32 new_c = ir.Or(c, ir.LogicalShiftRight(z, ir.Imm8(1)));
+ const IR::U32 new_v = ir.And(ir.LogicalShiftRight(not_c, ir.Imm8(1)),
+ ir.LogicalShiftRight(z, ir.Imm8(2)));
+
+ const IR::U32 result = ir.Or(ir.Or(ir.Or(new_n, new_z), new_c), new_v);
+
+ ir.SetNZCVRaw(result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
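
A compact scalar model (derived from the IR above, illustrative only) of what AXFlag computes on a raw NZCV word: N and V are cleared, Z becomes Z|V and C becomes C&~V, which folds the "unordered" comparison result into Z.

    #include <cstdint>

    // Bits 31..28 of the word are N, Z, C, V respectively.
    constexpr std::uint32_t AXFlagModel(std::uint32_t nzcv) {
        const std::uint32_t z = nzcv & 0x40000000;
        const std::uint32_t c = nzcv & 0x20000000;
        const std::uint32_t v = nzcv & 0x10000000;
        return ((v << 2) | z) | (c & ~(v << 1) & 0x20000000);
    }

    static_assert(AXFlagModel(0x30000000) == 0x40000000);  // unordered (C=1, V=1) -> Z=1 only
    static_assert(AXFlagModel(0x60000000) == 0x60000000);  // equal (Z=1, C=1) is left unchanged
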
diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp
new file mode 100644
index 0000000000..65ec23e71c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp
@@ -0,0 +1,62 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2019 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/translate/impl/impl.h"
+
+namespace Dynarmic::A64 {
+
+bool TranslatorVisitor::CFINV() {
+ const IR::U32 nzcv = ir.GetNZCVRaw();
+ const IR::U32 result = ir.Eor(nzcv, ir.Imm32(0x20000000));
+
+ ir.SetNZCVRaw(result);
+ return true;
+}
+
+bool TranslatorVisitor::RMIF(Imm<6> lsb, Reg Rn, Imm<4> mask) {
+ const u32 mask_value = mask.ZeroExtend();
+
+ // If no bits are to be moved into the NZCV bits, then we
+ // just preserve the bits and do no extra work.
+ if (mask_value == 0) {
+ ir.SetNZCVRaw(ir.GetNZCVRaw());
+ return true;
+ }
+
+ const IR::U64 tmp_reg = ir.GetX(Rn);
+ const IR::U64 rotated = ir.RotateRight(tmp_reg, ir.Imm8(lsb.ZeroExtend<u8>()));
+ const IR::U32 shifted = ir.LeastSignificantWord(ir.LogicalShiftLeft(rotated, ir.Imm8(28)));
+
+ // On the other hand, if all mask bits are set, then we move all four
+ // relevant bits in the source register to the NZCV bits.
+ if (mask_value == 0b1111) {
+ ir.SetNZCVRaw(shifted);
+ return true;
+ }
+
+ // Determine which bits from the PSTATE will be preserved during the operation.
+ u32 preservation_mask = 0;
+ if ((mask_value & 0b1000) == 0) {
+ preservation_mask |= 1U << 31;
+ }
+ if ((mask_value & 0b0100) == 0) {
+ preservation_mask |= 1U << 30;
+ }
+ if ((mask_value & 0b0010) == 0) {
+ preservation_mask |= 1U << 29;
+ }
+ if ((mask_value & 0b0001) == 0) {
+ preservation_mask |= 1U << 28;
+ }
+
+ const IR::U32 masked = ir.And(shifted, ir.Imm32(~preservation_mask));
+ const IR::U32 nzcv = ir.And(ir.GetNZCVRaw(), ir.Imm32(preservation_mask));
+ const IR::U32 result = ir.Or(nzcv, masked);
+
+ ir.SetNZCVRaw(result);
+ return true;
+}
+
+} // namespace Dynarmic::A64
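
A scalar model (illustrative, not part of the patch) of the RMIF translation above: the source register is rotated right by lsb, its low four bits become candidate N, Z, C and V values, and mask selects which of them replace the corresponding PSTATE bits.

    #include <cstdint>

    // Returns the new raw NZCV word (bits 31..28 = N, Z, C, V).
    constexpr std::uint32_t RmifModel(std::uint64_t xn, unsigned lsb, unsigned mask, std::uint32_t old_nzcv) {
        const std::uint64_t rotated = (xn >> lsb) | (xn << ((64 - lsb) % 64));
        const std::uint32_t candidate = static_cast<std::uint32_t>(rotated << 28);  // rotated[3:0] -> bits 31..28
        const std::uint32_t select = static_cast<std::uint32_t>(mask) << 28;        // mask bit 3 -> N, ..., bit 0 -> V
        return (candidate & select) | (old_nzcv & ~select);
    }

    // Copy bit 0 of Xn into V while leaving N, Z and C untouched:
    static_assert(RmifModel(/*xn=*/1, /*lsb=*/0, /*mask=*/0b0001, /*old_nzcv=*/0xE0000000) == 0xF0000000);
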
diff --git a/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h b/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h
new file mode 100644
index 0000000000..cf7d0e64bc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h
@@ -0,0 +1,190 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <tuple>
+
+#include <mcl/assert.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/type_traits/function_info.hpp>
+
+namespace Dynarmic::Decoder {
+namespace detail {
+
+template<size_t N>
+inline consteval std::array<char, N> StringToArray(const char (&str)[N + 1]) {
+ std::array<char, N> result{};
+ for (size_t i = 0; i < N; i++) {
+ result[i] = str[i];
+ }
+ return result;
+}
+
+/**
+ * Helper functions for the decoders.
+ *
+ * @tparam MatcherT The type of the Matcher to use.
+ */
+template<class MatcherT>
+struct detail {
+ using opcode_type = typename MatcherT::opcode_type;
+ using visitor_type = typename MatcherT::visitor_type;
+
+ static constexpr size_t opcode_bitsize = mcl::bitsizeof<opcode_type>;
+
+ /**
+ * Generates the mask and the expected value after masking from a given bitstring.
+ * A '0' in a bitstring indicates that a zero must be present at that bit position.
+ * A '1' in a bitstring indicates that a one must be present at that bit position.
+ */
+#ifdef __clang__
+ static constexpr auto GetMaskAndExpect(std::array<char, opcode_bitsize> bitstring) {
+#else
+ static consteval auto GetMaskAndExpect(std::array<char, opcode_bitsize> bitstring) {
+#endif
+ const auto one = static_cast<opcode_type>(1);
+ opcode_type mask = 0, expect = 0;
+ for (size_t i = 0; i < opcode_bitsize; i++) {
+ const size_t bit_position = opcode_bitsize - i - 1;
+ switch (bitstring[i]) {
+ case '0':
+ mask |= one << bit_position;
+ break;
+ case '1':
+ expect |= one << bit_position;
+ mask |= one << bit_position;
+ break;
+ default:
+ // Ignore
+ break;
+ }
+ }
+ return std::make_tuple(mask, expect);
+ }
+
+ /**
+ * Generates the masks and shifts for each argument.
+ * A '-' in a bitstring indicates that we don't care about that value.
+ * An argument is specified by a continuous string of the same character.
+ */
+ template<size_t N>
+ static consteval auto GetArgInfo(std::array<char, opcode_bitsize> bitstring) {
+ std::array<opcode_type, N> masks = {};
+ std::array<size_t, N> shifts = {};
+ size_t arg_index = 0;
+ char ch = 0;
+
+ for (size_t i = 0; i < opcode_bitsize; i++) {
+ if (bitstring[i] == '0' || bitstring[i] == '1' || bitstring[i] == '-') {
+ if (ch != 0) {
+ ch = 0;
+ arg_index++;
+ }
+ } else {
+ if (ch == 0) {
+ ch = bitstring[i];
+ } else if (ch != bitstring[i]) {
+ ch = bitstring[i];
+ arg_index++;
+ }
+
+ if constexpr (N > 0) {
+ const size_t bit_position = opcode_bitsize - i - 1;
+
+ if (arg_index >= N)
+ throw std::out_of_range("Unexpected field");
+
+ masks[arg_index] |= static_cast<opcode_type>(1) << bit_position;
+ shifts[arg_index] = bit_position;
+ } else {
+ throw std::out_of_range("Unexpected field");
+ }
+ }
+ }
+
+#if !defined(DYNARMIC_IGNORE_ASSERTS) && !defined(__ANDROID__)
+ // Avoids an MSVC ICE and an Android NDK issue.
+ ASSERT(std::all_of(masks.begin(), masks.end(), [](auto m) { return m != 0; }));
+#endif
+
+ return std::make_tuple(masks, shifts);
+ }
+
+ /**
+ * This struct's Make member function generates a lambda which decodes an instruction based on
+ * the provided arg_masks and arg_shifts. The Visitor member function to call is passed to
+ * Make as the fn argument.
+ */
+ template<typename FnT>
+ struct VisitorCaller;
+
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable : 4800) // forcing value to bool 'true' or 'false' (performance warning)
+#endif
+ template<typename Visitor, typename... Args, typename CallRetT>
+ struct VisitorCaller<CallRetT (Visitor::*)(Args...)> {
+ template<size_t... iota>
+ static auto Make(std::integer_sequence<size_t, iota...>,
+ CallRetT (Visitor::*const fn)(Args...),
+ const std::array<opcode_type, sizeof...(iota)> arg_masks,
+ const std::array<size_t, sizeof...(iota)> arg_shifts) {
+ static_assert(std::is_same_v<visitor_type, Visitor>, "Member function is not from Matcher's Visitor");
+ return [fn, arg_masks, arg_shifts](Visitor& v, opcode_type instruction) {
+ (void)instruction;
+ (void)arg_masks;
+ (void)arg_shifts;
+ return (v.*fn)(static_cast<Args>((instruction & arg_masks[iota]) >> arg_shifts[iota])...);
+ };
+ }
+ };
+
+ template<typename Visitor, typename... Args, typename CallRetT>
+ struct VisitorCaller<CallRetT (Visitor::*)(Args...) const> {
+ template<size_t... iota>
+ static auto Make(std::integer_sequence<size_t, iota...>,
+ CallRetT (Visitor::*const fn)(Args...) const,
+ const std::array<opcode_type, sizeof...(iota)> arg_masks,
+ const std::array<size_t, sizeof...(iota)> arg_shifts) {
+ static_assert(std::is_same_v<visitor_type, const Visitor>, "Member function is not from Matcher's Visitor");
+ return [fn, arg_masks, arg_shifts](const Visitor& v, opcode_type instruction) {
+ (void)instruction;
+ (void)arg_masks;
+ (void)arg_shifts;
+ return (v.*fn)(static_cast<Args>((instruction & arg_masks[iota]) >> arg_shifts[iota])...);
+ };
+ }
+ };
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+ /**
+ * Creates a matcher that can match and parse instructions based on bitstring.
+ * See also: GetMaskAndExpect and GetArgInfo for format of bitstring.
+ */
+ template<auto bitstring, typename FnT>
+ static auto GetMatcher(FnT fn, const char* const name) {
+ constexpr size_t args_count = mcl::parameter_count_v<FnT>;
+
+ constexpr auto mask = std::get<0>(GetMaskAndExpect(bitstring));
+ constexpr auto expect = std::get<1>(GetMaskAndExpect(bitstring));
+ constexpr auto arg_masks = std::get<0>(GetArgInfo<args_count>(bitstring));
+ constexpr auto arg_shifts = std::get<1>(GetArgInfo<args_count>(bitstring));
+
+ using Iota = std::make_index_sequence<args_count>;
+
+ const auto proxy_fn = VisitorCaller<FnT>::Make(Iota(), fn, arg_masks, arg_shifts);
+ return MatcherT(name, mask, expect, proxy_fn);
+ }
+};
+
+#define DYNARMIC_DECODER_GET_MATCHER(MatcherT, fn, name, bitstring) Decoder::detail::detail<MatcherT<V>>::template GetMatcher<bitstring>(&V::fn, name)
+
+} // namespace detail
+} // namespace Dynarmic::Decoder
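
To make the bitstring convention above concrete, here is an illustrative pattern in the style of the A32 decode tables (the specific mnemonic is only an example): '0' and '1' positions feed the mask/expect pair, while each run of a letter becomes one argument field.

    // bitstring "cccc000100101111111111110001mmmm"   ('c' = condition field, 'm' = Rm field)
    //
    //   GetMaskAndExpect -> mask   = 0x0FFFFFF0   (every '0'/'1' position)
    //                       expect = 0x012FFF10   (every '1' position)
    //   GetArgInfo<2>    -> arg_masks  = { 0xF0000000, 0x0000000F }
    //                       arg_shifts = { 28, 0 }
    //
    // so the generated lambda extracts cond = (inst & 0xF0000000) >> 28 and Rm = inst & 0xF
    // before invoking the bound visitor member function.
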
diff --git a/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h b/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h
new file mode 100644
index 0000000000..adf9556dd4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h
@@ -0,0 +1,76 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+
+#include <mcl/assert.hpp>
+
+namespace Dynarmic::Decoder {
+
+/**
+ * Generic instruction handling construct.
+ *
+ * @tparam Visitor An arbitrary visitor type that will be passed through
+ * to the function being handled. This type must be the
+ * type of the first parameter in a handler function.
+ *
+ * @tparam OpcodeType Type representing an opcode. This must be the
+ * type of the second parameter in a handler function.
+ */
+template<typename Visitor, typename OpcodeType>
+class Matcher {
+public:
+ using opcode_type = OpcodeType;
+ using visitor_type = Visitor;
+ using handler_return_type = typename Visitor::instruction_return_type;
+ using handler_function = std::function<handler_return_type(Visitor&, opcode_type)>;
+
+ Matcher(const char* const name, opcode_type mask, opcode_type expected, handler_function func)
+ : name{name}, mask{mask}, expected{expected}, fn{std::move(func)} {}
+
+ /// Gets the name of this type of instruction.
+ const char* GetName() const {
+ return name;
+ }
+
+ /// Gets the mask for this instruction.
+ opcode_type GetMask() const {
+ return mask;
+ }
+
+ /// Gets the expected value after masking for this instruction.
+ opcode_type GetExpected() const {
+ return expected;
+ }
+
+ /**
+ * Tests to see if the given instruction is the instruction this matcher represents.
+ * @param instruction The instruction to test
+ * @returns true if the given instruction matches.
+ */
+ bool Matches(opcode_type instruction) const {
+ return (instruction & mask) == expected;
+ }
+
+ /**
+ * Calls the corresponding instruction handler on visitor for this type of instruction.
+ * @param v The visitor to use
+ * @param instruction The instruction to decode.
+ */
+ handler_return_type call(Visitor& v, opcode_type instruction) const {
+ ASSERT(Matches(instruction));
+ return fn(v, instruction);
+ }
+
+private:
+ const char* name;
+ opcode_type mask;
+ opcode_type expected;
+ handler_function fn;
+};
+
+} // namespace Dynarmic::Decoder
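
A minimal sketch of how a decode table built from these matchers is typically consumed (the table and visitor types here are hypothetical, not defined by this patch): find the first matcher whose mask/expect pair accepts the opcode, then dispatch through call.

    #include <optional>
    #include <vector>

    #include "dynarmic/frontend/decoder/matcher.h"

    template<typename Visitor, typename OpcodeType>
    std::optional<typename Visitor::instruction_return_type>
    Decode(const std::vector<Dynarmic::Decoder::Matcher<Visitor, OpcodeType>>& table,
           Visitor& visitor, OpcodeType instruction) {
        for (const auto& matcher : table) {
            if (matcher.Matches(instruction)) {
                return matcher.call(visitor, instruction);  // call() asserts Matches() internally
            }
        }
        return std::nullopt;  // no matcher accepted the opcode: an unallocated encoding
    }
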
diff --git a/externals/dynarmic/src/dynarmic/frontend/imm.cpp b/externals/dynarmic/src/dynarmic/frontend/imm.cpp
new file mode 100644
index 0000000000..c802864df9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/imm.cpp
@@ -0,0 +1,67 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/imm.h"
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic {
+
+u64 AdvSIMDExpandImm(bool op, Imm<4> cmode, Imm<8> imm8) {
+ switch (cmode.Bits<1, 3>()) {
+ case 0b000:
+ return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>());
+ case 0b001:
+ return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 8);
+ case 0b010:
+ return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 16);
+ case 0b011:
+ return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 24);
+ case 0b100:
+ return mcl::bit::replicate_element<u16, u64>(imm8.ZeroExtend<u64>());
+ case 0b101:
+ return mcl::bit::replicate_element<u16, u64>(imm8.ZeroExtend<u64>() << 8);
+ case 0b110:
+ if (!cmode.Bit<0>()) {
+ return mcl::bit::replicate_element<u32, u64>((imm8.ZeroExtend<u64>() << 8) | mcl::bit::ones<u64>(8));
+ }
+ return mcl::bit::replicate_element<u32, u64>((imm8.ZeroExtend<u64>() << 16) | mcl::bit::ones<u64>(16));
+ case 0b111:
+ if (!cmode.Bit<0>() && !op) {
+ return mcl::bit::replicate_element<u8, u64>(imm8.ZeroExtend<u64>());
+ }
+ if (!cmode.Bit<0>() && op) {
+ u64 result = 0;
+ result |= imm8.Bit<0>() ? mcl::bit::ones<u64>(8) << (0 * 8) : 0;
+ result |= imm8.Bit<1>() ? mcl::bit::ones<u64>(8) << (1 * 8) : 0;
+ result |= imm8.Bit<2>() ? mcl::bit::ones<u64>(8) << (2 * 8) : 0;
+ result |= imm8.Bit<3>() ? mcl::bit::ones<u64>(8) << (3 * 8) : 0;
+ result |= imm8.Bit<4>() ? mcl::bit::ones<u64>(8) << (4 * 8) : 0;
+ result |= imm8.Bit<5>() ? mcl::bit::ones<u64>(8) << (5 * 8) : 0;
+ result |= imm8.Bit<6>() ? mcl::bit::ones<u64>(8) << (6 * 8) : 0;
+ result |= imm8.Bit<7>() ? mcl::bit::ones<u64>(8) << (7 * 8) : 0;
+ return result;
+ }
+ if (cmode.Bit<0>() && !op) {
+ u64 result = 0;
+ result |= imm8.Bit<7>() ? 0x80000000 : 0;
+ result |= imm8.Bit<6>() ? 0x3E000000 : 0x40000000;
+ result |= imm8.Bits<0, 5, u64>() << 19;
+ return mcl::bit::replicate_element<u32, u64>(result);
+ }
+ if (cmode.Bit<0>() && op) {
+ u64 result = 0;
+ result |= imm8.Bit<7>() ? 0x80000000'00000000 : 0;
+ result |= imm8.Bit<6>() ? 0x3FC00000'00000000 : 0x40000000'00000000;
+ result |= imm8.Bits<0, 5, u64>() << 48;
+ return result;
+ }
+ }
+ UNREACHABLE();
+}
+
+} // namespace Dynarmic
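
Two worked expansions (derived from the cases above, for illustration only):

    // cmode = 0b0000, op = 0, imm8 = 0x7F: the byte is zero-extended and replicated as a 32-bit element,
    //     AdvSIMDExpandImm(false, Imm<4>{0b0000}, Imm<8>{0x7F}) == 0x0000007F'0000007F
    //
    // cmode = 0b1110, op = 1, imm8 = 0xA5: each imm8 bit selects an all-ones or all-zeros byte,
    //     AdvSIMDExpandImm(true, Imm<4>{0b1110}, Imm<8>{0xA5}) == 0xFF00FF00'00FF00FF
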
diff --git a/externals/dynarmic/src/dynarmic/frontend/imm.h b/externals/dynarmic/src/dynarmic/frontend/imm.h
new file mode 100644
index 0000000000..7d86abbb61
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/frontend/imm.h
@@ -0,0 +1,168 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <type_traits>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+#include <mcl/bitsizeof.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/math_util.h"
+
+namespace Dynarmic {
+
+/**
+ * Imm represents an immediate value in an AArch32/AArch64 instruction.
+ * Imm is used during translation as a typesafe way of passing around immediates of fixed sizes.
+ */
+template<size_t bit_size_>
+class Imm {
+public:
+ static constexpr size_t bit_size = bit_size_;
+
+ explicit Imm(u32 value)
+ : value(value) {
+ ASSERT_MSG((mcl::bit::get_bits<0, bit_size - 1>(value) == value), "More bits in value than expected");
+ }
+
+ template<typename T = u32>
+ T ZeroExtend() const {
+ static_assert(mcl::bitsizeof<T> >= bit_size);
+ return static_cast<T>(value);
+ }
+
+ template<typename T = s32>
+ T SignExtend() const {
+ static_assert(mcl::bitsizeof<T> >= bit_size);
+ return static_cast<T>(mcl::bit::sign_extend<bit_size, std::make_unsigned_t<T>>(value));
+ }
+
+ template<size_t bit>
+ bool Bit() const {
+ static_assert(bit < bit_size);
+ return mcl::bit::get_bit<bit>(value);
+ }
+
+ template<size_t begin_bit, size_t end_bit, typename T = u32>
+ T Bits() const {
+ static_assert(begin_bit <= end_bit && end_bit < bit_size);
+ static_assert(mcl::bitsizeof<T> >= end_bit - begin_bit + 1);
+ return static_cast<T>(mcl::bit::get_bits<begin_bit, end_bit>(value));
+ }
+
+ bool operator==(Imm other) const {
+ return value == other.value;
+ }
+
+ bool operator!=(Imm other) const {
+ return !operator==(other);
+ }
+
+ bool operator<(Imm other) const {
+ return value < other.value;
+ }
+
+ bool operator<=(Imm other) const {
+ return value <= other.value;
+ }
+
+ bool operator>(Imm other) const {
+ return value > other.value;
+ }
+
+ bool operator>=(Imm other) const {
+ return value >= other.value;
+ }
+
+private:
+ static_assert(bit_size != 0, "Cannot have a zero-sized immediate");
+ static_assert(bit_size <= 32, "Cannot have an immediate larger than the instruction size");
+
+ u32 value;
+};
+
+template<size_t bit_size>
+bool operator==(u32 a, Imm<bit_size> b) {
+ return Imm<bit_size>{a} == b;
+}
+
+template<size_t bit_size>
+bool operator==(Imm<bit_size> a, u32 b) {
+ return Imm<bit_size>{b} == a;
+}
+
+template<size_t bit_size>
+bool operator!=(u32 a, Imm<bit_size> b) {
+ return !operator==(a, b);
+}
+
+template<size_t bit_size>
+bool operator!=(Imm<bit_size> a, u32 b) {
+ return !operator==(a, b);
+}
+
+template<size_t bit_size>
+bool operator<(u32 a, Imm<bit_size> b) {
+ return Imm<bit_size>{a} < b;
+}
+
+template<size_t bit_size>
+bool operator<(Imm<bit_size> a, u32 b) {
+ return a < Imm<bit_size>{b};
+}
+
+template<size_t bit_size>
+bool operator<=(u32 a, Imm<bit_size> b) {
+ return !operator<(b, a);
+}
+
+template<size_t bit_size>
+bool operator<=(Imm<bit_size> a, u32 b) {
+ return !operator<(b, a);
+}
+
+template<size_t bit_size>
+bool operator>(u32 a, Imm<bit_size> b) {
+ return operator<(b, a);
+}
+
+template<size_t bit_size>
+bool operator>(Imm<bit_size> a, u32 b) {
+ return operator<(b, a);
+}
+
+template<size_t bit_size>
+bool operator>=(u32 a, Imm<bit_size> b) {
+ return !operator<(a, b);
+}
+
+template<size_t bit_size>
+bool operator>=(Imm<bit_size> a, u32 b) {
+ return !operator<(a, b);
+}
+
+/**
+ * Concatenate immediates together.
+ * Left to right corresponds to most significant imm to least significant imm.
+ * This is equivalent to a:b:...:z in ASL.
+ */
+template<size_t first_bit_size, size_t... rest_bit_sizes>
+auto concatenate(Imm<first_bit_size> first, Imm<rest_bit_sizes>... rest) {
+ if constexpr (sizeof...(rest) == 0) {
+ return first;
+ } else {
+ const auto concat_rest = concatenate(rest...);
+ const u32 value = (first.ZeroExtend() << concat_rest.bit_size) | concat_rest.ZeroExtend();
+ return Imm<Common::Sum(first_bit_size, rest_bit_sizes...)>{value};
+ }
+}
+
+/// Expands an Advanced SIMD modified immediate.
+u64 AdvSIMDExpandImm(bool op, Imm<4> cmode, Imm<8> imm8);
+
+} // namespace Dynarmic
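
A small usage sketch of concatenate (illustrative only): the first operand ends up in the most significant bits, mirroring the a:b notation used in the ARM pseudocode.

    #include "dynarmic/frontend/imm.h"

    // Joining a 1-bit and a 3-bit immediate yields an Imm<4> with the 1-bit value on top.
    inline Dynarmic::Imm<4> ExampleConcatenate() {
        const Dynarmic::Imm<1> top{0b1};
        const Dynarmic::Imm<3> rest{0b010};
        return Dynarmic::concatenate(top, rest);  // ZeroExtend() == 0b1010
    }
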
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/a32.h b/externals/dynarmic/src/dynarmic/interface/A32/a32.h
new file mode 100644
index 0000000000..e1c5b40fd3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/a32.h
@@ -0,0 +1,109 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/interface/halt_reason.h"
+
+namespace Dynarmic {
+namespace A32 {
+
+class Jit final {
+public:
+ explicit Jit(UserConfig conf);
+ ~Jit();
+
+ /**
+ * Runs the emulated CPU.
+ * Cannot be recursively called.
+ */
+ HaltReason Run();
+
+ /**
+ * Steps the emulated CPU.
+ * Cannot be recursively called.
+ */
+ HaltReason Step();
+
+ /**
+ * Clears the code cache of all compiled code.
+ * Can be called at any time. Halts execution if called within a callback.
+ */
+ void ClearCache();
+
+ /**
+ * Invalidate the code cache at a range of addresses.
+ * @param start_address The starting address of the range to invalidate.
+ * @param length The length (in bytes) of the range to invalidate.
+ */
+ void InvalidateCacheRange(std::uint32_t start_address, std::size_t length);
+
+ /**
+ * Reset CPU state to state at startup. Does not clear code cache.
+ * Cannot be called from a callback.
+ */
+ void Reset();
+
+ /**
+ * Stops execution in Jit::Run.
+ */
+ void HaltExecution(HaltReason hr = HaltReason::UserDefined1);
+
+ /**
+ * Clears a halt reason from flags.
+ * Warning: Only use this if you're sure this won't introduce races.
+ */
+ void ClearHalt(HaltReason hr = HaltReason::UserDefined1);
+
+ /// View and modify registers.
+ std::array<std::uint32_t, 16>& Regs();
+ const std::array<std::uint32_t, 16>& Regs() const;
+ std::array<std::uint32_t, 64>& ExtRegs();
+ const std::array<std::uint32_t, 64>& ExtRegs() const;
+
+ /// View and modify CPSR.
+ std::uint32_t Cpsr() const;
+ void SetCpsr(std::uint32_t value);
+
+ /// View and modify FPSCR.
+ std::uint32_t Fpscr() const;
+ void SetFpscr(std::uint32_t value);
+
+ /// Clears exclusive state for this core.
+ void ClearExclusiveState();
+
+ /**
+ * Returns true if Jit::Run was called but hasn't returned yet.
+ * i.e.: We're in a callback.
+ */
+ bool IsExecuting() const {
+ return is_executing;
+ }
+
+ /// Debugging: Dump a disassembly of all compiled code to the console.
+ void DumpDisassembly() const;
+
+ /**
+ * Disassemble the instructions following the current pc and return
+ * the resulting instructions as a vector of their string representations.
+ */
+ std::vector<std::string> Disassemble() const;
+
+private:
+ bool is_executing = false;
+
+ struct Impl;
+ std::unique_ptr<Impl> impl;
+};
+
+} // namespace A32
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h b/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h
new file mode 100644
index 0000000000..240e40ee4c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h
@@ -0,0 +1,23 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic {
+namespace A32 {
+
+enum class ArchVersion {
+ v3,
+ v4,
+ v4T,
+ v5TE,
+ v6K,
+ v6T2,
+ v7,
+ v8,
+};
+
+} // namespace A32
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/config.h b/externals/dynarmic/src/dynarmic/interface/A32/config.h
new file mode 100644
index 0000000000..a3c2aa159c
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/config.h
@@ -0,0 +1,246 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "dynarmic/frontend/A32/translate/translate_callbacks.h"
+#include "dynarmic/interface/A32/arch_version.h"
+#include "dynarmic/interface/optimization_flags.h"
+
+namespace Dynarmic {
+class ExclusiveMonitor;
+} // namespace Dynarmic
+
+namespace Dynarmic {
+namespace A32 {
+
+using VAddr = std::uint32_t;
+
+class Coprocessor;
+
+enum class Exception {
+ /// An UndefinedFault occurred due to executing an instruction with an unallocated encoding
+ UndefinedInstruction,
+ /// An unpredictable instruction is to be executed. Implementation-defined behaviour should now happen.
+ /// This behaviour is up to the user of this library to define.
+ UnpredictableInstruction,
+ /// A decode error occurred when decoding this instruction. This should never happen.
+ DecodeError,
+ /// A SEV instruction was executed. The event register of all PEs should be set. (Hint instruction.)
+ SendEvent,
+ /// A SEVL instruction was executed. The event register of the current PE should be set. (Hint instruction.)
+ SendEventLocal,
+ /// A WFI instruction was executed. You may now enter a low-power state. (Hint instruction.)
+ WaitForInterrupt,
+ /// A WFE instruction was executed. You may now enter a low-power state if the event register is clear. (Hint instruction.)
+ WaitForEvent,
+ /// A YIELD instruction was executed. (Hint instruction.)
+ Yield,
+ /// A BKPT instruction was executed.
+ Breakpoint,
+ /// A PLD instruction was executed. (Hint instruction.)
+ PreloadData,
+ /// A PLDW instruction was executed. (Hint instruction.)
+ PreloadDataWithIntentToWrite,
+ /// A PLI instruction was executed. (Hint instruction.)
+ PreloadInstruction,
+ /// Attempted to execute a code block at an address for which MemoryReadCode returned std::nullopt.
+ /// (Intended to be used to emulate memory protection faults.)
+ NoExecuteFault,
+};
+
+/// These function pointers may be inserted into compiled code.
+struct UserCallbacks : public TranslateCallbacks {
+ virtual ~UserCallbacks() = default;
+
+ // All reads through this callback are 4-byte aligned.
+ // Memory must be interpreted as little endian.
+ std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) override { return MemoryRead32(vaddr); }
+
+ // This function is called before the instruction at pc is read.
+ // IR code can be emitted by the callee prior to instruction handling.
+ // By returning false the callee precludes the translation of the instruction;
+ // in such case the callee is responsible for setting the terminal.
+ bool PreCodeReadHook(bool /*is_thumb*/, VAddr /*pc*/, A32::IREmitter& /*ir*/) override { return true; }
+
+ // This function is called before the instruction at pc is translated.
+ // IR code can be emitted by the callee prior to translation of the instruction.
+ void PreCodeTranslationHook(bool /*is_thumb*/, VAddr /*pc*/, A32::IREmitter& /*ir*/) override {}
+
+ // Reads through these callbacks may not be aligned.
+ // Memory must be interpreted as if ENDIANSTATE == 0, endianness will be corrected by the JIT.
+ virtual std::uint8_t MemoryRead8(VAddr vaddr) = 0;
+ virtual std::uint16_t MemoryRead16(VAddr vaddr) = 0;
+ virtual std::uint32_t MemoryRead32(VAddr vaddr) = 0;
+ virtual std::uint64_t MemoryRead64(VAddr vaddr) = 0;
+
+ // Writes through these callbacks may not be aligned.
+ virtual void MemoryWrite8(VAddr vaddr, std::uint8_t value) = 0;
+ virtual void MemoryWrite16(VAddr vaddr, std::uint16_t value) = 0;
+ virtual void MemoryWrite32(VAddr vaddr, std::uint32_t value) = 0;
+ virtual void MemoryWrite64(VAddr vaddr, std::uint64_t value) = 0;
+
+ // Writes through these callbacks may not be aligned.
+ virtual bool MemoryWriteExclusive8(VAddr /*vaddr*/, std::uint8_t /*value*/, std::uint8_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive16(VAddr /*vaddr*/, std::uint16_t /*value*/, std::uint16_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive32(VAddr /*vaddr*/, std::uint32_t /*value*/, std::uint32_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive64(VAddr /*vaddr*/, std::uint64_t /*value*/, std::uint64_t /*expected*/) { return false; }
+
+ // If this callback returns true, the JIT will assume MemoryRead* callbacks will always
+ // return the same value at any point in time for this vaddr. The JIT may use this information
+ // in optimizations.
+ // A conservative implementation that always returns false is safe.
+ virtual bool IsReadOnlyMemory(VAddr /*vaddr*/) { return false; }
+
+ /// The interpreter must execute exactly num_instructions starting from PC.
+ virtual void InterpreterFallback(VAddr pc, size_t num_instructions) = 0;
+
+ // This callback is called whenever a SVC instruction is executed.
+ virtual void CallSVC(std::uint32_t swi) = 0;
+
+ virtual void ExceptionRaised(VAddr pc, Exception exception) = 0;
+
+ virtual void InstructionSynchronizationBarrierRaised() {}
+
+ // Timing-related callbacks
+ // Indicates that `ticks` ticks have passed.
+ virtual void AddTicks(std::uint64_t ticks) = 0;
+ // How many more ticks am I allowed to execute?
+ virtual std::uint64_t GetTicksRemaining() = 0;
+ // How many ticks should this instruction take to execute?
+ std::uint64_t GetTicksForCode(bool /*is_thumb*/, VAddr /*vaddr*/, std::uint32_t /*instruction*/) override { return 1; }
+};
+
+struct UserConfig {
+ UserCallbacks* callbacks;
+
+ size_t processor_id = 0;
+ ExclusiveMonitor* global_monitor = nullptr;
+
+ /// Select the architecture version to use.
+ /// There are minor behavioural differences between versions.
+ ArchVersion arch_version = ArchVersion::v8;
+
+ /// This selects optimizations that can't otherwise be disabled by setting other
+ /// configuration options. This includes:
+ /// - IR optimizations
+ /// - Block linking optimizations
+ /// - RSB optimizations
+ /// This is intended to be used for debugging.
+ OptimizationFlag optimizations = all_safe_optimizations;
+
+ bool HasOptimization(OptimizationFlag f) const {
+ if (!unsafe_optimizations) {
+ f &= all_safe_optimizations;
+ }
+ return (f & optimizations) != no_optimizations;
+ }
+
+ /// This enables unsafe optimizations that reduce emulation accuracy in favour of speed.
+ /// For safety, in order to enable unsafe optimizations you have to set BOTH this flag
+ /// AND the appropriate flag bits above.
+ /// The preferred and tested mode for this library is with unsafe optimizations disabled.
+ bool unsafe_optimizations = false;
+
+ // Page Table
+ // The page table is used for faster memory access. If an entry in the table is nullptr,
+ // the JIT will fall back to calling the MemoryRead*/MemoryWrite* callbacks.
+ static constexpr std::size_t PAGE_BITS = 12;
+ static constexpr std::size_t NUM_PAGE_TABLE_ENTRIES = 1 << (32 - PAGE_BITS);
+ std::array<std::uint8_t*, NUM_PAGE_TABLE_ENTRIES>* page_table = nullptr;
+ /// Determines if the pointer in the page_table shall be offset locally or globally.
+ /// 'false' will access page_table[addr >> bits][addr & mask]
+ /// 'true' will access page_table[addr >> bits][addr]
+ /// Note: page_table[addr >> bits] will still be checked to verify active pages.
+ /// So there might be wrongly faulted pages which map to nullptr.
+ /// This can be avoided by carefully allocating the memory region.
+ bool absolute_offset_page_table = false;
+ /// Masks out the first N bits in host pointers from the page table.
+ /// The intention behind this is to allow users of Dynarmic to pack attributes in the
+ /// same integer and update the pointer attribute pair atomically.
+ /// If the configured value is 3, all pointers will be forcefully aligned to 8 bytes.
+ int page_table_pointer_mask_bits = 0;
+ /// Determines if we should detect memory accesses via page_table that are misaligned.
+ /// Accesses that straddle page boundaries will fall back to the relevant
+ /// memory callback.
+ /// This value should be the required access sizes this applies to ORed together.
+ /// To detect any access, use: 8 | 16 | 32 | 64.
+ std::uint8_t detect_misaligned_access_via_page_table = 0;
+ /// Determines if the above option only triggers when the misalignment straddles a
+ /// page boundary.
+ bool only_detect_misalignment_via_page_table_on_page_boundary = false;
+
+ // Fastmem Pointer
+ // This should point to the beginning of a 4GB address space which is arranged just like
+ // what you wish for emulated memory to be. If the host page faults on an address, the JIT
+ // will fall back to calling the MemoryRead*/MemoryWrite* callbacks.
+ void* fastmem_pointer = nullptr;
+ /// Determines if instructions that pagefault should cause recompilation of that block
+ /// with fastmem disabled.
+ /// Recompiled code will use the page_table if this is available, otherwise memory
+ /// accesses will hit the memory callbacks.
+ bool recompile_on_fastmem_failure = true;
+
+ /// Determines if we should use the above fastmem_pointer for exclusive reads and
+ /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not
+ /// provide fully accurate emulation.
+ bool fastmem_exclusive_access = false;
+ /// Determines if exclusive access instructions that pagefault should cause
+ /// recompilation of that block with fastmem disabled. Recompiled code will use memory
+ /// callbacks.
+ bool recompile_on_exclusive_fastmem_failure = true;
+
+ // Coprocessors
+ std::array<std::shared_ptr<Coprocessor>, 16> coprocessors{};
+
+ /// When set to true, UserCallbacks::InstructionSynchronizationBarrierRaised will be
+ /// called when an ISB instruction is executed.
+ /// When set to false, ISB will be treated as a NOP instruction.
+ bool hook_isb = false;
+
+ /// When enabled, hint instructions cause ExceptionRaised to be called with the
+ /// appropriate argument.
+ bool hook_hint_instructions = false;
+
+ /// This option relates to translation. Generally when we run into an unpredictable
+ /// instruction the ExceptionRaised callback is called. If this is true, we define
+ /// definite behaviour for some unpredictable instructions.
+ bool define_unpredictable_behaviour = false;
+
+ /// HACK:
+ /// This tells the translator a wall clock will be used, thus allowing it
+ /// to avoid writing certain unnecessary code only needed for cycle timers.
+ bool wall_clock_cntpct = false;
+
+ /// This allows accurately emulating protection fault handlers. If true, we check
+ /// for exit after every data memory access by the emulated program.
+ bool check_halt_on_memory_access = false;
+
+ /// This option allows you to disable cycle counting. If this is set to false,
+ /// AddTicks and GetTicksRemaining are never called, and no cycle counting is done.
+ bool enable_cycle_counting = true;
+
+ /// This option relates to the CPSR.E flag. Enabling this option disables modification
+ /// of CPSR.E by the emulated program, forcing it to 0.
+ /// NOTE: Calling Jit::SetCpsr with CPSR.E=1 while this option is enabled may result
+ /// in unusual behavior.
+ bool always_little_endian = false;
+
+ // Minimum size is about 8MiB. Maximum size is about 128MiB (arm64 host) or 2GiB (x64 host).
+ // Maximum size is limited by the maximum length of an x86_64 / arm64 jump.
+ size_t code_cache_size = 128 * 1024 * 1024; // bytes
+
+ /// Internal use only
+ bool very_verbose_debugging_output = false;
+};
+
+} // namespace A32
+} // namespace Dynarmic
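
For orientation, a minimal sketch of how UserConfig and UserCallbacks are typically wired together (a trivial flat-memory environment; the MyEnvironment name and the 2 KiB memory array are illustrative, and error handling is omitted):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    #include "dynarmic/interface/A32/a32.h"
    #include "dynarmic/interface/A32/config.h"

    class MyEnvironment final : public Dynarmic::A32::UserCallbacks {
    public:
        std::array<std::uint8_t, 2048> memory{};
        std::uint64_t ticks_left = 0;

        std::uint8_t MemoryRead8(std::uint32_t vaddr) override {
            return vaddr < memory.size() ? memory[vaddr] : 0;
        }
        std::uint16_t MemoryRead16(std::uint32_t vaddr) override {
            return static_cast<std::uint16_t>(MemoryRead8(vaddr) | MemoryRead8(vaddr + 1) << 8);
        }
        std::uint32_t MemoryRead32(std::uint32_t vaddr) override {
            return MemoryRead16(vaddr) | static_cast<std::uint32_t>(MemoryRead16(vaddr + 2)) << 16;
        }
        std::uint64_t MemoryRead64(std::uint32_t vaddr) override {
            return MemoryRead32(vaddr) | static_cast<std::uint64_t>(MemoryRead32(vaddr + 4)) << 32;
        }

        void MemoryWrite8(std::uint32_t vaddr, std::uint8_t value) override {
            if (vaddr < memory.size()) memory[vaddr] = value;
        }
        void MemoryWrite16(std::uint32_t vaddr, std::uint16_t value) override {
            MemoryWrite8(vaddr, static_cast<std::uint8_t>(value));
            MemoryWrite8(vaddr + 1, static_cast<std::uint8_t>(value >> 8));
        }
        void MemoryWrite32(std::uint32_t vaddr, std::uint32_t value) override {
            MemoryWrite16(vaddr, static_cast<std::uint16_t>(value));
            MemoryWrite16(vaddr + 2, static_cast<std::uint16_t>(value >> 16));
        }
        void MemoryWrite64(std::uint32_t vaddr, std::uint64_t value) override {
            MemoryWrite32(vaddr, static_cast<std::uint32_t>(value));
            MemoryWrite32(vaddr + 4, static_cast<std::uint32_t>(value >> 32));
        }

        void InterpreterFallback(std::uint32_t, std::size_t) override {}  // not exercised in this sketch
        void CallSVC(std::uint32_t) override {}
        void ExceptionRaised(std::uint32_t, Dynarmic::A32::Exception) override {}
        void AddTicks(std::uint64_t ticks) override { ticks_left = ticks >= ticks_left ? 0 : ticks_left - ticks; }
        std::uint64_t GetTicksRemaining() override { return ticks_left; }
    };

    // Typical usage: point the config at the callbacks, give the core a tick budget, then run.
    //     MyEnvironment env;
    //     Dynarmic::A32::UserConfig config{};
    //     config.callbacks = &env;
    //     Dynarmic::A32::Jit jit{config};
    //     env.ticks_left = 100;
    //     jit.Run();
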
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h
new file mode 100644
index 0000000000..7ee2e96ee3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h
@@ -0,0 +1,110 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <variant>
+
+#include "dynarmic/interface/A32/coprocessor_util.h"
+
+namespace Dynarmic {
+namespace A32 {
+
+class Jit;
+
+class Coprocessor {
+public:
+ virtual ~Coprocessor() = default;
+
+ struct Callback {
+ /**
+ * @param jit CPU state
+ * @param user_arg Set to Callback::user_arg at runtime
+ * @param arg0 Purpose of this argument depends on type of callback.
+ * @param arg1 Purpose of this argument depends on type of callback.
+ * @return Purpose of return value depends on type of callback.
+ */
+ std::uint64_t (*function)(void* user_arg, std::uint32_t arg0, std::uint32_t arg1);
+ /// If std::nullopt, function will be called with a user_arg parameter containing garbage.
+ std::optional<void*> user_arg;
+ };
+
+ /**
+ * std::monostate: coprocessor exception will be compiled
+ * Callback: a call to the Callback will be compiled
+ * std::uint32_t*: a write/read to that memory address will be compiled
+ */
+ using CallbackOrAccessOneWord = std::variant<std::monostate, Callback, std::uint32_t*>;
+
+ /**
+ * std::monostate: coprocessor exception will be compiled
+ * Callback: a call to the Callback will be compiled
+ * std::array<std::uint32_t*, 2>: a write/read to those memory addresses will be compiled
+ */
+ using CallbackOrAccessTwoWords = std::variant<std::monostate, Callback, std::array<std::uint32_t*, 2>>;
+
+ /**
+ * Called when compiling CDP or CDP2 for this coprocessor.
+ * A return value of std::nullopt will cause a coprocessor exception to be compiled.
+ * arg0, arg1 and return value of callback are ignored.
+ */
+ virtual std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0;
+
+ /**
+ * Called when compiling MCR or MCR2 for this coprocessor.
+ * A return value of std::monostate will cause a coprocessor exception to be compiled.
+ * arg0 of the callback will contain the word sent to the coprocessor.
+ * arg1 and return value of the callback are ignored.
+ */
+ virtual CallbackOrAccessOneWord CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0;
+
+ /**
+ * Called when compiling MCRR or MCRR2 for this coprocessor.
+ * A return value of std::monostate will cause a coprocessor exception to be compiled.
+ * arg0 and arg1 of the callback will contain the words sent to the coprocessor.
+ * The return value of the callback is ignored.
+ */
+ virtual CallbackOrAccessTwoWords CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) = 0;
+
+ /**
+ * Called when compiling MRC or MRC2 for this coprocessor.
+ * A return value of std::monostate will cause a coprocessor exception to be compiled.
+ * The return value of the callback should contain word from coprocessor.
+ * The low word of the return value will be stored in Rt.
+ * arg0 and arg1 of the callback are ignored.
+ */
+ virtual CallbackOrAccessOneWord CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0;
+
+ /**
+ * Called when compiling MRRC or MRRC2 for this coprocessor.
+ * A return value of std::monostate will cause a coprocessor exception to be compiled.
+ * The return value of the callback should contain words from coprocessor.
+ * The low word of the return value will be stored in Rt.
+ * The high word of the return value will be stored in Rt2.
+ * arg0 and arg1 of the callback are ignored.
+ */
+ virtual CallbackOrAccessTwoWords CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) = 0;
+
+ /**
+ * Called when compiling LDC or LDC2 for this coprocessor.
+ * A return value of std::nullopt will cause a coprocessor exception to be compiled.
+ * arg0 of the callback will contain the start address.
+ * arg1 and return value of the callback are ignored.
+ */
+ virtual std::optional<Callback> CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd, std::optional<std::uint8_t> option) = 0;
+
+ /**
+ * Called when compiling STC or STC2 for this coprocessor.
+ * A return value of std::nullopt will cause a coprocessor exception to be compiled.
+ * arg0 of the callback will contain the start address.
+ * arg1 and return value of the callback are ignored.
+ */
+ virtual std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, std::optional<std::uint8_t> option) = 0;
+};
+
+} // namespace A32
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h
new file mode 100644
index 0000000000..ed695d17e9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h
@@ -0,0 +1,31 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic {
+namespace A32 {
+
+enum class CoprocReg {
+ C0,
+ C1,
+ C2,
+ C3,
+ C4,
+ C5,
+ C6,
+ C7,
+ C8,
+ C9,
+ C10,
+ C11,
+ C12,
+ C13,
+ C14,
+ C15
+};
+
+} // namespace A32
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h b/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h
new file mode 100644
index 0000000000..54058bf5ee
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h
@@ -0,0 +1,18 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+namespace Dynarmic {
+namespace A32 {
+
+std::string DisassembleArm(std::uint32_t instruction);
+std::string DisassembleThumb16(std::uint16_t instruction);
+
+} // namespace A32
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A64/a64.h b/externals/dynarmic/src/dynarmic/interface/A64/a64.h
new file mode 100644
index 0000000000..a150da8448
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A64/a64.h
@@ -0,0 +1,137 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/interface/halt_reason.h"
+
+namespace Dynarmic {
+namespace A64 {
+
+class Jit final {
+public:
+ explicit Jit(UserConfig conf);
+ ~Jit();
+
+ /**
+ * Runs the emulated CPU.
+ * Cannot be recursively called.
+ */
+ HaltReason Run();
+
+ /**
+ * Step the emulated CPU for one instruction.
+ * Cannot be recursively called.
+ */
+ HaltReason Step();
+
+ /**
+ * Clears the code cache of all compiled code.
+ * Can be called at any time. Halts execution if called within a callback.
+ */
+ void ClearCache();
+
+ /**
+ * Invalidate the code cache at a range of addresses.
+ * @param start_address The starting address of the range to invalidate.
+ * @param length The length (in bytes) of the range to invalidate.
+ */
+ void InvalidateCacheRange(std::uint64_t start_address, std::size_t length);
+
+ /**
+ * Reset CPU state to state at startup. Does not clear code cache.
+ * Cannot be called from a callback.
+ */
+ void Reset();
+
+ /**
+ * Stops execution in Jit::Run.
+ */
+ void HaltExecution(HaltReason hr = HaltReason::UserDefined1);
+
+ /**
+ * Clears a halt reason from flags.
+ * Warning: Only use this if you're sure this won't introduce races.
+ */
+ void ClearHalt(HaltReason hr = HaltReason::UserDefined1);
+
+ /// Read Stack Pointer
+ std::uint64_t GetSP() const;
+ /// Modify Stack Pointer
+ void SetSP(std::uint64_t value);
+
+ /// Read Program Counter
+ std::uint64_t GetPC() const;
+ /// Modify Program Counter
+ void SetPC(std::uint64_t value);
+
+ /// Read general-purpose register.
+ std::uint64_t GetRegister(std::size_t index) const;
+ /// Modify general-purpose register.
+ void SetRegister(size_t index, std::uint64_t value);
+
+ /// Read all general-purpose registers.
+ std::array<std::uint64_t, 31> GetRegisters() const;
+ /// Modify all general-purpose registers.
+ void SetRegisters(const std::array<std::uint64_t, 31>& value);
+
+ /// Read floating point and SIMD register.
+ Vector GetVector(std::size_t index) const;
+ /// Modify floating point and SIMD register.
+ void SetVector(std::size_t index, Vector value);
+
+ /// Read all floating point and SIMD registers.
+ std::array<Vector, 32> GetVectors() const;
+ /// Modify all floating point and SIMD registers.
+ void SetVectors(const std::array<Vector, 32>& value);
+
+ /// View FPCR.
+ std::uint32_t GetFpcr() const;
+ /// Modify FPCR.
+ void SetFpcr(std::uint32_t value);
+
+ /// View FPSR.
+ std::uint32_t GetFpsr() const;
+ /// Modify FPSR.
+ void SetFpsr(std::uint32_t value);
+
+ /// View PSTATE
+ std::uint32_t GetPstate() const;
+ /// Modify PSTATE
+ void SetPstate(std::uint32_t value);
+
+ /// Clears exclusive state for this core.
+ void ClearExclusiveState();
+
+ /**
+ * Returns true if Jit::Run was called but hasn't returned yet.
+ * i.e.: We're in a callback.
+ */
+ bool IsExecuting() const;
+
+ /// Debugging: Dump a disassembly of all compiled code to the console.
+ void DumpDisassembly() const;
+
+ /**
+ * Disassemble the instructions following the current pc and return
+ * the resulting instructions as a vector of their string representations.
+ */
+ std::vector<std::string> Disassemble() const;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> impl;
+};
+
+} // namespace A64
+} // namespace Dynarmic
diff --git a/externals/dynarmic/src/dynarmic/interface/A64/config.h b/externals/dynarmic/src/dynarmic/interface/A64/config.h
new file mode 100644
index 0000000000..b10c65bc6f
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/A64/config.h
@@ -0,0 +1,297 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "dynarmic/interface/optimization_flags.h"
+
+namespace Dynarmic {
+class ExclusiveMonitor;
+} // namespace Dynarmic
+
+namespace Dynarmic {
+namespace A64 {
+
+using VAddr = std::uint64_t;
+
+using Vector = std::array<std::uint64_t, 2>;
+static_assert(sizeof(Vector) == sizeof(std::uint64_t) * 2, "Vector must be 128 bits in size");
+
+enum class Exception {
+ /// An UndefinedFault occurred due to executing an instruction with an unallocated encoding
+ UnallocatedEncoding,
+ /// An UndefinedFault occurred due to executing an instruction containing a reserved value
+ ReservedValue,
+ /// An unpredictable instruction is to be executed. Implementation-defined behaviour should now happen.
+ /// This behaviour is up to the user of this library to define.
+ /// Note: Constraints on unpredictable behaviour are specified in the ARMv8 ARM.
+ UnpredictableInstruction,
+ /// A WFI instruction was executed. You may now enter a low-power state. (Hint instruction.)
+ WaitForInterrupt,
+ /// A WFE instruction was executed. You may now enter a low-power state if the event register is clear. (Hint instruction.)
+ WaitForEvent,
+ /// A SEV instruction was executed. The event register of all PEs should be set. (Hint instruction.)
+ SendEvent,
+ /// A SEVL instruction was executed. The event register of the current PE should be set. (Hint instruction.)
+ SendEventLocal,
+ /// A YIELD instruction was executed. (Hint instruction.)
+ Yield,
+ /// A BRK instruction was executed.
+ Breakpoint,
+ /// Attempted to execute a code block at an address for which MemoryReadCode returned std::nullopt.
+ /// (Intended to be used to emulate memory protection faults.)
+ NoExecuteFault,
+};
+
+enum class DataCacheOperation {
+ /// DC CISW
+ CleanAndInvalidateBySetWay,
+ /// DC CIVAC
+ CleanAndInvalidateByVAToPoC,
+ /// DC CSW
+ CleanBySetWay,
+ /// DC CVAC
+ CleanByVAToPoC,
+ /// DC CVAU
+ CleanByVAToPoU,
+ /// DC CVAP
+ CleanByVAToPoP,
+ /// DC ISW
+ InvalidateBySetWay,
+ /// DC IVAC
+ InvalidateByVAToPoC,
+ /// DC ZVA
+ ZeroByVA,
+};
+
+enum class InstructionCacheOperation {
+ /// IC IVAU
+ InvalidateByVAToPoU,
+ /// IC IALLU
+ InvalidateAllToPoU,
+ /// IC IALLUIS
+ InvalidateAllToPoUInnerSharable
+};
+
+struct UserCallbacks {
+ virtual ~UserCallbacks() = default;
+
+ // All reads through this callback are 4-byte aligned.
+ // Memory must be interpreted as little endian.
+ virtual std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) { return MemoryRead32(vaddr); }
+
+ // Reads through these callbacks may not be aligned.
+ virtual std::uint8_t MemoryRead8(VAddr vaddr) = 0;
+ virtual std::uint16_t MemoryRead16(VAddr vaddr) = 0;
+ virtual std::uint32_t MemoryRead32(VAddr vaddr) = 0;
+ virtual std::uint64_t MemoryRead64(VAddr vaddr) = 0;
+ virtual Vector MemoryRead128(VAddr vaddr) = 0;
+
+ // Writes through these callbacks may not be aligned.
+ virtual void MemoryWrite8(VAddr vaddr, std::uint8_t value) = 0;
+ virtual void MemoryWrite16(VAddr vaddr, std::uint16_t value) = 0;
+ virtual void MemoryWrite32(VAddr vaddr, std::uint32_t value) = 0;
+ virtual void MemoryWrite64(VAddr vaddr, std::uint64_t value) = 0;
+ virtual void MemoryWrite128(VAddr vaddr, Vector value) = 0;
+
+ // Writes through these callbacks may not be aligned.
+ virtual bool MemoryWriteExclusive8(VAddr /*vaddr*/, std::uint8_t /*value*/, std::uint8_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive16(VAddr /*vaddr*/, std::uint16_t /*value*/, std::uint16_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive32(VAddr /*vaddr*/, std::uint32_t /*value*/, std::uint32_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive64(VAddr /*vaddr*/, std::uint64_t /*value*/, std::uint64_t /*expected*/) { return false; }
+ virtual bool MemoryWriteExclusive128(VAddr /*vaddr*/, Vector /*value*/, Vector /*expected*/) { return false; }
+
+ // If this callback returns true, the JIT will assume MemoryRead* callbacks will always
+ // return the same value at any point in time for this vaddr. The JIT may use this information
+ // in optimizations.
+ // A conservative implementation that always returns false is safe.
+ virtual bool IsReadOnlyMemory(VAddr /*vaddr*/) { return false; }
+
+ /// The interpreter must execute exactly num_instructions starting from PC.
+ virtual void InterpreterFallback(VAddr pc, size_t num_instructions) = 0;
+
+ // This callback is called whenever a SVC instruction is executed.
+ virtual void CallSVC(std::uint32_t swi) = 0;
+
+ virtual void ExceptionRaised(VAddr pc, Exception exception) = 0;
+ virtual void DataCacheOperationRaised(DataCacheOperation /*op*/, VAddr /*value*/) {}
+ virtual void InstructionCacheOperationRaised(InstructionCacheOperation /*op*/, VAddr /*value*/) {}
+ virtual void InstructionSynchronizationBarrierRaised() {}
+
+ // Timing-related callbacks
+ // Indicates that `ticks` ticks have passed.
+ virtual void AddTicks(std::uint64_t ticks) = 0;
+ // How many more ticks am I allowed to execute?
+ virtual std::uint64_t GetTicksRemaining() = 0;
+ // Get value in the emulated counter-timer physical count register.
+ virtual std::uint64_t GetCNTPCT() = 0;
+};
+
+struct UserConfig {
+ UserCallbacks* callbacks;
+
+ size_t processor_id = 0;
+ ExclusiveMonitor* global_monitor = nullptr;
+
+ /// This selects optimizations that can't otherwise be disabled by setting other
+ /// configuration options. This includes:
+ /// - IR optimizations
+ /// - Block linking optimizations
+ /// - RSB optimizations
+ /// This is intended to be used for debugging.
+ OptimizationFlag optimizations = all_safe_optimizations;
+
+ bool HasOptimization(OptimizationFlag f) const {
+ if (!unsafe_optimizations) {
+ f &= all_safe_optimizations;
+ }
+ return (f & optimizations) != no_optimizations;
+ }
+
+ /// This enables unsafe optimizations that reduce emulation accuracy in favour of speed.
+ /// For safety, in order to enable unsafe optimizations you have to set BOTH this flag
+ /// AND the appropriate flag bits above.
+ /// The preferred and tested mode for this library is with unsafe optimizations disabled.
+ bool unsafe_optimizations = false;
+
+ /// When set to true, UserCallbacks::DataCacheOperationRaised will be called when any
+ /// data cache instruction is executed. Notably DC ZVA will not implicitly do anything.
+ /// When set to false, UserCallbacks::DataCacheOperationRaised will never be called.
+ /// Executing DC ZVA in this mode will result in zeros being written to memory.
+ bool hook_data_cache_operations = false;
+
+ /// When set to true, UserCallbacks::InstructionSynchronizationBarrierRaised will be
+ /// called when an ISB instruction is executed.
+ /// When set to false, ISB will be treated as a NOP instruction.
+ bool hook_isb = false;
+
+ /// When set to true, UserCallbacks::ExceptionRaised will be called when any hint
+ /// instruction is executed.
+ bool hook_hint_instructions = false;
+
+ /// Counter-timer frequency register. The value of the register is not interpreted by
+ /// dynarmic.
+ std::uint32_t cntfrq_el0 = 600000000;
+
+ /// CTR_EL0<27:24> is log2 of the cache writeback granule in words.
+ /// CTR_EL0<23:20> is log2 of the exclusives reservation granule in words.
+ /// CTR_EL0<19:16> is log2 of the smallest data/unified cacheline in words.
+ /// CTR_EL0<15:14> is the level 1 instruction cache policy.
+ /// CTR_EL0<3:0> is log2 of the smallest instruction cacheline in words.
+ std::uint32_t ctr_el0 = 0x8444c004;
+
+ /// DCZID_EL0<3:0> is log2 of the block size in words
+ /// DCZID_EL0<4> is 0 if the DC ZVA instruction is permitted.
+ std::uint32_t dczid_el0 = 4;
+
+ /// Pointer to where TPIDRRO_EL0 is stored. This pointer will be inserted into
+ /// emitted code.
+ const std::uint64_t* tpidrro_el0 = nullptr;
+
+ /// Pointer to where TPIDR_EL0 is stored. This pointer will be inserted into
+ /// emitted code.
+ std::uint64_t* tpidr_el0 = nullptr;
+
+ /// Pointer to the page table which we can use for direct page table access.
+ /// If an entry in page_table is null, the relevant memory callback will be called.
+ /// If page_table is nullptr, all memory accesses hit the memory callbacks.
+ void** page_table = nullptr;
+ /// Declares how many valid address bits there are in virtual addresses.
+ /// Determines the size of page_table. Valid values are between 12 and 64 inclusive.
+ /// This is only used if page_table is not nullptr.
+ size_t page_table_address_space_bits = 36;
+ /// Masks out the first N bits in host pointers from the page table.
+ /// The intention behind this is to allow users of Dynarmic to pack attributes in the
+ /// same integer and update the pointer attribute pair atomically.
+ /// If the configured value is 3, all pointers will be forcefully aligned to 8 bytes.
+ int page_table_pointer_mask_bits = 0;
+ /// Determines what happens if the guest accesses an entry that is off the end of the
+ /// page table. If true, Dynarmic will silently mirror page_table's address space. If
+ /// false, accessing memory outside of page_table bounds will result in a call to the
+ /// relevant memory callback.
+ /// This is only used if page_table is not nullptr.
+ bool silently_mirror_page_table = true;
+ /// Determines if the pointer in the page_table shall be offset locally or globally.
+ /// 'false' will access page_table[addr >> bits][addr & mask]
+ /// 'true' will access page_table[addr >> bits][addr]
+ /// Note: page_table[addr >> bits] will still be checked to verify active pages.
+ /// So there might be wrongly faulted pages which map to nullptr.
+ /// This can be avoided by carefully allocating the memory region.
+ bool absolute_offset_page_table = false;
+ /// Determines if we should detect memory accesses via page_table that are
+ /// misaligned. Accesses that straddle page boundaries will fall back to the relevant
+ /// memory callback.
+ /// This value should be the access sizes this applies to, ORed together.
+ /// To detect any access, use: 8 | 16 | 32 | 64 | 128.
+ std::uint8_t detect_misaligned_access_via_page_table = 0;
+ /// Determines if the above option only triggers when the misalignment straddles a
+ /// page boundary.
+ bool only_detect_misalignment_via_page_table_on_page_boundary = false;
+
+ /// Fastmem Pointer
+ /// This should point to the beginning of a 2^fastmem_address_space_bits byte
+ /// address space which is arranged just like you wish emulated memory to
+ /// be. If the host page faults on an address, the JIT will fall back to calling the
+ /// MemoryRead*/MemoryWrite* callbacks.
+ void* fastmem_pointer = nullptr;
+ /// Determines if instructions that pagefault should cause recompilation of that block
+ /// with fastmem disabled.
+ /// Recompiled code will use the page_table if this is available, otherwise memory
+ /// accesses will hit the memory callbacks.
+ bool recompile_on_fastmem_failure = true;
+ /// Declares how many valid address bits there are in virtual addresses.
+ /// Determines the size of fastmem arena. Valid values are between 12 and 64 inclusive.
+ /// This is only used if fastmem_pointer is not nullptr.
+ size_t fastmem_address_space_bits = 36;
+ /// Determines what happens if the guest accesses an entry that is off the end of the
+ /// fastmem arena. If true, Dynarmic will silently mirror fastmem's address space. If
+ /// false, accessing memory outside of fastmem bounds will result in a call to the
+ /// relevant memory callback.
+ /// This is only used if fastmem_pointer is not nullptr.
+ bool silently_mirror_fastmem = true;
+
+ /// Determines if we should use the above fastmem_pointer for exclusive reads and
+ /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not
+ /// provide fully accurate emulation.
+ bool fastmem_exclusive_access = false;
+ /// Determines if exclusive access instructions that pagefault should cause
+ /// recompilation of that block with fastmem disabled. Recompiled code will use memory
+ /// callbacks.
+ bool recompile_on_exclusive_fastmem_failure = true;
+
+ /// This option relates to translation. Generally when we run into an unpredictable
+ /// instruction the ExceptionRaised callback is called. If this is true, we define
+ /// definite behaviour for some unpredictable instructions.
+ bool define_unpredictable_behaviour = false;
+
+ /// HACK:
+ /// This tells the translator a wall clock will be used, thus allowing it
+ /// to avoid writing certain unnecessary code only needed for cycle timers.
+ bool wall_clock_cntpct = false;
+
+ /// This allows accurately emulating protection fault handlers. If true, we check
+ /// for exit after every data memory access by the emulated program.
+ bool check_halt_on_memory_access = false;
+
+ /// This option allows you to disable cycle counting. If this is set to false,
+ /// AddTicks and GetTicksRemaining are never called, and no cycle counting is done.
+ bool enable_cycle_counting = true;
+
+ // Minimum size is about 8MiB. Maximum size is about 128MiB (arm64 host) or 2GiB (x64 host).
+ // Maximum size is limited by the maximum length of an x86_64 / arm64 jump.
+ size_t code_cache_size = 128 * 1024 * 1024; // bytes
+
+ /// Internal use only
+ bool very_verbose_debugging_output = false;
+};
+
+} // namespace A64
+} // namespace Dynarmic
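
As a minimal sketch of how the options above interact: an unsafe flag only takes effect once both the flag bit and unsafe_optimizations are set. This assumes the headers install under <dynarmic/interface/...>, uses the flag names from optimization_flags.h later in this diff, and MakeConfig is a hypothetical helper, not part of the patch.

    #include <dynarmic/interface/A64/config.h>

    Dynarmic::A64::UserConfig MakeConfig(Dynarmic::A64::UserCallbacks* callbacks) {
        Dynarmic::A64::UserConfig config;
        config.callbacks = callbacks;

        // Request an unsafe flag on top of the safe defaults.
        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;

        // Here HasOptimization(Unsafe_UnfuseFMA) still returns false, because the flag
        // is masked with all_safe_optimizations while unsafe_optimizations is false.
        config.unsafe_optimizations = true;
        // Now both conditions hold and HasOptimization(Unsafe_UnfuseFMA) returns true.

        return config;
    }
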
diff --git a/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h
new file mode 100644
index 0000000000..4813675873
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h
@@ -0,0 +1,88 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include <dynarmic/common/spin_lock.h>
+
+namespace Dynarmic {
+
+using VAddr = std::uint64_t;
+using Vector = std::array<std::uint64_t, 2>;
+
+class ExclusiveMonitor {
+public:
+ /// @param processor_count Maximum number of processors using this global
+ /// exclusive monitor. Each processor must have a
+ /// unique id.
+ explicit ExclusiveMonitor(size_t processor_count);
+
+ size_t GetProcessorCount() const;
+
+ /// Marks the reservation granule containing address as exclusive to processor
+ /// processor_id, then returns the value read by op.
+ template<typename T, typename Function>
+ T ReadAndMark(size_t processor_id, VAddr address, Function op) {
+ static_assert(std::is_trivially_copyable_v<T>);
+ const VAddr masked_address = address & RESERVATION_GRANULE_MASK;
+
+ Lock();
+ exclusive_addresses[processor_id] = masked_address;
+ const T value = op();
+ std::memcpy(exclusive_values[processor_id].data(), &value, sizeof(T));
+ Unlock();
+ return value;
+ }
+
+ /// Checks to see if processor processor_id has exclusive access to the
+ /// region containing address. If it does, executes the operation and then
+ /// clears the exclusive state of any processor whose exclusive region
+ /// contains that address.
+ template<typename T, typename Function>
+ bool DoExclusiveOperation(size_t processor_id, VAddr address, Function op) {
+ static_assert(std::is_trivially_copyable_v<T>);
+ if (!CheckAndClear(processor_id, address)) {
+ return false;
+ }
+
+ T saved_value;
+ std::memcpy(&saved_value, exclusive_values[processor_id].data(), sizeof(T));
+ const bool result = op(saved_value);
+
+ Unlock();
+ return result;
+ }
+
+ /// Unmark everything.
+ void Clear();
+ /// Unmark processor id
+ void ClearProcessor(size_t processor_id);
+
+private:
+ bool CheckAndClear(size_t processor_id, VAddr address);
+
+ void Lock();
+ void Unlock();
+
+ friend volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor*);
+ friend size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor*);
+ friend VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor*, size_t index);
+ friend Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor*, size_t index);
+
+ static constexpr VAddr RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFFFull;
+ static constexpr VAddr INVALID_EXCLUSIVE_ADDRESS = 0xDEAD'DEAD'DEAD'DEADull;
+ SpinLock lock;
+ std::vector<VAddr> exclusive_addresses;
+ std::vector<Vector> exclusive_values;
+};
+
+} // namespace Dynarmic
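
A rough sketch of how an embedder might pair ReadAndMark with DoExclusiveOperation to emulate a load-exclusive/store-exclusive sequence. ReadWord and WriteWord are hypothetical guest-memory accessors supplied by the embedder, not part of this header.

    #include <cstddef>
    #include <cstdint>

    #include <dynarmic/interface/exclusive_monitor.h>

    // Hypothetical guest-memory accessors provided elsewhere by the embedder.
    std::uint32_t ReadWord(Dynarmic::VAddr vaddr);
    void WriteWord(Dynarmic::VAddr vaddr, std::uint32_t value);

    std::uint32_t LoadExclusive32(Dynarmic::ExclusiveMonitor& monitor, std::size_t core, Dynarmic::VAddr vaddr) {
        // Takes the lock, records the reservation for this core, then performs the read.
        return monitor.ReadAndMark<std::uint32_t>(core, vaddr, [&] { return ReadWord(vaddr); });
    }

    bool StoreExclusive32(Dynarmic::ExclusiveMonitor& monitor, std::size_t core, Dynarmic::VAddr vaddr, std::uint32_t value) {
        // The write only happens if this core still holds its reservation for vaddr.
        return monitor.DoExclusiveOperation<std::uint32_t>(core, vaddr, [&](std::uint32_t /*expected*/) {
            WriteWord(vaddr, value);
            return true;
        });
    }
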
diff --git a/externals/dynarmic/src/dynarmic/interface/halt_reason.h b/externals/dynarmic/src/dynarmic/interface/halt_reason.h
new file mode 100644
index 0000000000..121a8455cc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/halt_reason.h
@@ -0,0 +1,54 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace Dynarmic {
+
+enum class HaltReason : std::uint32_t {
+ Step = 0x00000001,
+ CacheInvalidation = 0x00000002,
+ MemoryAbort = 0x00000004,
+ UserDefined1 = 0x01000000,
+ UserDefined2 = 0x02000000,
+ UserDefined3 = 0x04000000,
+ UserDefined4 = 0x08000000,
+ UserDefined5 = 0x10000000,
+ UserDefined6 = 0x20000000,
+ UserDefined7 = 0x40000000,
+ UserDefined8 = 0x80000000,
+};
+
+constexpr HaltReason operator~(HaltReason hr) {
+ return static_cast<HaltReason>(~static_cast<std::uint32_t>(hr));
+}
+
+constexpr HaltReason operator|(HaltReason hr1, HaltReason hr2) {
+ return static_cast<HaltReason>(static_cast<std::uint32_t>(hr1) | static_cast<std::uint32_t>(hr2));
+}
+
+constexpr HaltReason operator&(HaltReason hr1, HaltReason hr2) {
+ return static_cast<HaltReason>(static_cast<std::uint32_t>(hr1) & static_cast<std::uint32_t>(hr2));
+}
+
+constexpr HaltReason operator|=(HaltReason& result, HaltReason hr) {
+ return result = (result | hr);
+}
+
+constexpr HaltReason operator&=(HaltReason& result, HaltReason hr) {
+ return result = (result & hr);
+}
+
+constexpr bool operator!(HaltReason hr) {
+ return static_cast<std::uint32_t>(hr) == 0;
+}
+
+constexpr bool Has(HaltReason hr1, HaltReason hr2) {
+ return (static_cast<std::uint32_t>(hr1) & static_cast<std::uint32_t>(hr2)) != 0;
+}
+
+} // namespace Dynarmic
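
A short sketch of how the operators above compose; Has, operator| and operator& are found through argument-dependent lookup because HaltReason lives in namespace Dynarmic.

    #include <dynarmic/interface/halt_reason.h>

    using Dynarmic::HaltReason;

    constexpr HaltReason step_or_abort = HaltReason::Step | HaltReason::MemoryAbort;

    static_assert(Has(step_or_abort, HaltReason::Step));
    static_assert(!Has(step_or_abort, HaltReason::CacheInvalidation));
    static_assert(!(step_or_abort & HaltReason::UserDefined1));
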
diff --git a/externals/dynarmic/src/dynarmic/interface/optimization_flags.h b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h
new file mode 100644
index 0000000000..2f65f0bfa4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h
@@ -0,0 +1,81 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace Dynarmic {
+
+enum class OptimizationFlag : std::uint32_t {
+ /// This optimization avoids dispatcher lookups by allowing emitted basic blocks to jump
+ /// directly to other basic blocks if the destination PC is predictable at JIT-time.
+ /// This is a safe optimization.
+ BlockLinking = 0x00000001,
+ /// This optimization avoids dispatcher lookups by emulating a return stack buffer. This
+ /// allows for function returns and syscall returns to be predicted at runtime.
+ /// This is a safe optimization.
+ ReturnStackBuffer = 0x00000002,
+ /// This optimization enables a two-tiered dispatch system.
+ /// A fast dispatcher (written in assembly) first does a look-up in a small MRU cache.
+ /// If this fails, it falls back to the usual slower dispatcher.
+ /// This is a safe optimization.
+ FastDispatch = 0x00000004,
+ /// This is an IR optimization. This optimization eliminates unnecessary emulated CPU state
+ /// context lookups.
+ /// This is a safe optimization.
+ GetSetElimination = 0x00000008,
+ /// This is an IR optimization. This optimization does constant propagation.
+ /// This is a safe optimization.
+ ConstProp = 0x00000010,
+ /// This enables miscellaneous safe IR optimizations.
+ MiscIROpt = 0x00000020,
+
+ /// This is an UNSAFE optimization that reduces accuracy of fused multiply-add operations.
+ /// This unfuses fused instructions to improve performance on host CPUs without FMA support.
+ Unsafe_UnfuseFMA = 0x00010000,
+ /// This is an UNSAFE optimization that reduces accuracy of certain floating-point instructions.
+ /// This allows results of FRECPE and FRSQRTE to have **less** error than spec allows.
+ Unsafe_ReducedErrorFP = 0x00020000,
+ /// This is an UNSAFE optimization that causes floating-point instructions to not produce correct NaNs.
+ /// This may also result in inaccurate results when instructions are given certain special values.
+ Unsafe_InaccurateNaN = 0x00040000,
+ /// This is an UNSAFE optimization that causes ASIMD floating-point instructions to be run with incorrect
+ /// rounding modes. This may result in inaccurate results with all floating-point ASIMD instructions.
+ Unsafe_IgnoreStandardFPCRValue = 0x00080000,
+ /// This is an UNSAFE optimization that causes the global monitor to be ignored. This may
+ /// result in unexpected behaviour in multithreaded scenarios, including but not limited
+ /// to data races and deadlocks.
+ Unsafe_IgnoreGlobalMonitor = 0x00100000,
+};
+
+constexpr OptimizationFlag no_optimizations = static_cast<OptimizationFlag>(0);
+constexpr OptimizationFlag all_safe_optimizations = static_cast<OptimizationFlag>(0x0000FFFF);
+
+constexpr OptimizationFlag operator~(OptimizationFlag f) {
+ return static_cast<OptimizationFlag>(~static_cast<std::uint32_t>(f));
+}
+
+constexpr OptimizationFlag operator|(OptimizationFlag f1, OptimizationFlag f2) {
+ return static_cast<OptimizationFlag>(static_cast<std::uint32_t>(f1) | static_cast<std::uint32_t>(f2));
+}
+
+constexpr OptimizationFlag operator&(OptimizationFlag f1, OptimizationFlag f2) {
+ return static_cast<OptimizationFlag>(static_cast<std::uint32_t>(f1) & static_cast<std::uint32_t>(f2));
+}
+
+constexpr OptimizationFlag operator|=(OptimizationFlag& result, OptimizationFlag f) {
+ return result = (result | f);
+}
+
+constexpr OptimizationFlag operator&=(OptimizationFlag& result, OptimizationFlag f) {
+ return result = (result & f);
+}
+
+constexpr bool operator!(OptimizationFlag f) {
+ return f == no_optimizations;
+}
+
+} // namespace Dynarmic
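
The masking done by UserConfig::HasOptimization in the A64 config earlier in this diff reduces to an AND against all_safe_optimizations; a small sketch of that behaviour in isolation:

    #include <dynarmic/interface/optimization_flags.h>

    using Dynarmic::OptimizationFlag;

    constexpr OptimizationFlag requested = OptimizationFlag::BlockLinking | OptimizationFlag::Unsafe_InaccurateNaN;

    // Masking with all_safe_optimizations strips the unsafe bit, which is exactly
    // what HasOptimization does while unsafe_optimizations is false.
    constexpr OptimizationFlag effective = requested & Dynarmic::all_safe_optimizations;

    static_assert(!!(effective & OptimizationFlag::BlockLinking));
    static_assert(!(effective & OptimizationFlag::Unsafe_InaccurateNaN));
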
diff --git a/externals/dynarmic/src/dynarmic/ir/acc_type.h b/externals/dynarmic/src/dynarmic/ir/acc_type.h
new file mode 100644
index 0000000000..f259660b68
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/acc_type.h
@@ -0,0 +1,29 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::IR {
+
+enum class AccType {
+ NORMAL,
+ VEC,
+ STREAM,
+ VECSTREAM,
+ ATOMIC,
+ ORDERED,
+ ORDEREDRW,
+ LIMITEDORDERED,
+ UNPRIV,
+ IFETCH,
+ PTW,
+ DC,
+ IC,
+ DCZVA,
+ AT,
+ SWAP, // TODO: Remove
+};
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/basic_block.cpp b/externals/dynarmic/src/dynarmic/ir/basic_block.cpp
new file mode 100644
index 0000000000..cc7eb48ef6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/basic_block.cpp
@@ -0,0 +1,244 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/basic_block.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <map>
+#include <string>
+
+#include <fmt/format.h>
+#include <mcl/assert.hpp>
+
+#include "dynarmic/common/memory_pool.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/frontend/A64/a64_types.h"
+#include "dynarmic/ir/cond.h"
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::IR {
+
+Block::Block(const LocationDescriptor& location)
+ : location{location}, end_location{location}, cond{Cond::AL}, instruction_alloc_pool{std::make_unique<Common::Pool>(sizeof(Inst), 4096)} {}
+
+Block::~Block() = default;
+
+Block::Block(Block&&) = default;
+
+Block& Block::operator=(Block&&) = default;
+
+void Block::AppendNewInst(Opcode opcode, std::initializer_list<IR::Value> args) {
+ PrependNewInst(end(), opcode, args);
+}
+
+Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode opcode, std::initializer_list<Value> args) {
+ IR::Inst* inst = new (instruction_alloc_pool->Alloc()) IR::Inst(opcode);
+ ASSERT(args.size() == inst->NumArgs());
+
+ std::for_each(args.begin(), args.end(), [&inst, index = size_t(0)](const auto& arg) mutable {
+ inst->SetArg(index, arg);
+ index++;
+ });
+
+ return instructions.insert_before(insertion_point, inst);
+}
+
+LocationDescriptor Block::Location() const {
+ return location;
+}
+
+LocationDescriptor Block::EndLocation() const {
+ return end_location;
+}
+
+void Block::SetEndLocation(const LocationDescriptor& descriptor) {
+ end_location = descriptor;
+}
+
+Cond Block::GetCondition() const {
+ return cond;
+}
+
+void Block::SetCondition(Cond condition) {
+ cond = condition;
+}
+
+LocationDescriptor Block::ConditionFailedLocation() const {
+ return *cond_failed;
+}
+
+void Block::SetConditionFailedLocation(LocationDescriptor fail_location) {
+ cond_failed = fail_location;
+}
+
+size_t& Block::ConditionFailedCycleCount() {
+ return cond_failed_cycle_count;
+}
+
+const size_t& Block::ConditionFailedCycleCount() const {
+ return cond_failed_cycle_count;
+}
+
+bool Block::HasConditionFailedLocation() const {
+ return cond_failed.has_value();
+}
+
+Block::InstructionList& Block::Instructions() {
+ return instructions;
+}
+
+const Block::InstructionList& Block::Instructions() const {
+ return instructions;
+}
+
+Terminal Block::GetTerminal() const {
+ return terminal;
+}
+
+void Block::SetTerminal(Terminal term) {
+ ASSERT_MSG(!HasTerminal(), "Terminal has already been set.");
+ terminal = std::move(term);
+}
+
+void Block::ReplaceTerminal(Terminal term) {
+ ASSERT_MSG(HasTerminal(), "Terminal has not been set.");
+ terminal = std::move(term);
+}
+
+bool Block::HasTerminal() const {
+ return terminal.which() != 0;
+}
+
+size_t& Block::CycleCount() {
+ return cycle_count;
+}
+
+const size_t& Block::CycleCount() const {
+ return cycle_count;
+}
+
+static std::string TerminalToString(const Terminal& terminal_variant) {
+ struct : boost::static_visitor<std::string> {
+ std::string operator()(const Term::Invalid&) const {
+ return "<invalid terminal>";
+ }
+ std::string operator()(const Term::Interpret& terminal) const {
+ return fmt::format("Interpret{{{}}}", terminal.next);
+ }
+ std::string operator()(const Term::ReturnToDispatch&) const {
+ return "ReturnToDispatch{}";
+ }
+ std::string operator()(const Term::LinkBlock& terminal) const {
+ return fmt::format("LinkBlock{{{}}}", terminal.next);
+ }
+ std::string operator()(const Term::LinkBlockFast& terminal) const {
+ return fmt::format("LinkBlockFast{{{}}}", terminal.next);
+ }
+ std::string operator()(const Term::PopRSBHint&) const {
+ return "PopRSBHint{}";
+ }
+ std::string operator()(const Term::FastDispatchHint&) const {
+ return "FastDispatchHint{}";
+ }
+ std::string operator()(const Term::If& terminal) const {
+ return fmt::format("If{{{}, {}, {}}}", A64::CondToString(terminal.if_), TerminalToString(terminal.then_), TerminalToString(terminal.else_));
+ }
+ std::string operator()(const Term::CheckBit& terminal) const {
+ return fmt::format("CheckBit{{{}, {}}}", TerminalToString(terminal.then_), TerminalToString(terminal.else_));
+ }
+ std::string operator()(const Term::CheckHalt& terminal) const {
+ return fmt::format("CheckHalt{{{}}}", TerminalToString(terminal.else_));
+ }
+ } visitor;
+
+ return boost::apply_visitor(visitor, terminal_variant);
+}
+
+std::string DumpBlock(const IR::Block& block) {
+ std::string ret;
+
+ ret += fmt::format("Block: location={}\n", block.Location());
+ ret += fmt::format("cycles={}", block.CycleCount());
+ ret += fmt::format(", entry_cond={}", A64::CondToString(block.GetCondition()));
+ if (block.GetCondition() != Cond::AL) {
+ ret += fmt::format(", cond_fail={}", block.ConditionFailedLocation());
+ }
+ ret += '\n';
+
+ const auto arg_to_string = [](const IR::Value& arg) -> std::string {
+ if (arg.IsEmpty()) {
+ return "<null>";
+ } else if (!arg.IsImmediate()) {
+ if (const unsigned name = arg.GetInst()->GetName()) {
+ return fmt::format("%{}", name);
+ }
+ return fmt::format("%<unnamed inst {:016x}>", reinterpret_cast<u64>(arg.GetInst()));
+ }
+ switch (arg.GetType()) {
+ case Type::U1:
+ return fmt::format("#{}", arg.GetU1() ? '1' : '0');
+ case Type::U8:
+ return fmt::format("#{}", arg.GetU8());
+ case Type::U16:
+ return fmt::format("#{:#x}", arg.GetU16());
+ case Type::U32:
+ return fmt::format("#{:#x}", arg.GetU32());
+ case Type::U64:
+ return fmt::format("#{:#x}", arg.GetU64());
+ case Type::A32Reg:
+ return A32::RegToString(arg.GetA32RegRef());
+ case Type::A32ExtReg:
+ return A32::ExtRegToString(arg.GetA32ExtRegRef());
+ case Type::A64Reg:
+ return A64::RegToString(arg.GetA64RegRef());
+ case Type::A64Vec:
+ return A64::VecToString(arg.GetA64VecRef());
+ default:
+ return "<unknown immediate type>";
+ }
+ };
+
+ for (const auto& inst : block) {
+ const Opcode op = inst.GetOpcode();
+
+ ret += fmt::format("[{:016x}] ", reinterpret_cast<u64>(&inst));
+ if (GetTypeOf(op) != Type::Void) {
+ if (inst.GetName()) {
+ ret += fmt::format("%{:<5} = ", inst.GetName());
+ } else {
+ ret += "noname = ";
+ }
+ } else {
+ ret += " "; // '%00000 = ' -> 1 + 5 + 3 = 9 spaces
+ }
+
+ ret += GetNameOf(op);
+
+ const size_t arg_count = GetNumArgsOf(op);
+ for (size_t arg_index = 0; arg_index < arg_count; arg_index++) {
+ const Value arg = inst.GetArg(arg_index);
+
+ ret += arg_index != 0 ? ", " : " ";
+ ret += arg_to_string(arg);
+
+ Type actual_type = arg.GetType();
+ Type expected_type = GetArgTypeOf(op, arg_index);
+ if (!AreTypesCompatible(actual_type, expected_type)) {
+ ret += fmt::format("<type error: {} != {}>", GetNameOf(actual_type), GetNameOf(expected_type));
+ }
+ }
+
+ ret += fmt::format(" (uses: {})", inst.UseCount());
+
+ ret += '\n';
+ }
+
+ ret += "terminal = " + TerminalToString(block.GetTerminal()) + '\n';
+
+ return ret;
+}
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/basic_block.h b/externals/dynarmic/src/dynarmic/ir/basic_block.h
new file mode 100644
index 0000000000..18d7fadae5
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/basic_block.h
@@ -0,0 +1,168 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <initializer_list>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include <mcl/container/intrusive_list.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/location_descriptor.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/terminal.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::Common {
+class Pool;
+}
+
+namespace Dynarmic::IR {
+
+enum class Cond;
+enum class Opcode;
+
+/**
+ * A basic block. It consists of zero or more instructions followed by exactly one terminal.
+ * Note that this is a linear IR and not a pure tree-based IR, i.e. there is an ordering to
+ * the microinstructions. This only matters before chaining is done in order to correctly
+ * order memory accesses.
+ */
+class Block final {
+public:
+ using InstructionList = mcl::intrusive_list<Inst>;
+ using size_type = InstructionList::size_type;
+ using iterator = InstructionList::iterator;
+ using const_iterator = InstructionList::const_iterator;
+ using reverse_iterator = InstructionList::reverse_iterator;
+ using const_reverse_iterator = InstructionList::const_reverse_iterator;
+
+ explicit Block(const LocationDescriptor& location);
+ ~Block();
+
+ Block(const Block&) = delete;
+ Block& operator=(const Block&) = delete;
+
+ Block(Block&&);
+ Block& operator=(Block&&);
+
+ bool empty() const { return instructions.empty(); }
+ size_type size() const { return instructions.size(); }
+
+ Inst& front() { return instructions.front(); }
+ const Inst& front() const { return instructions.front(); }
+
+ Inst& back() { return instructions.back(); }
+ const Inst& back() const { return instructions.back(); }
+
+ iterator begin() { return instructions.begin(); }
+ const_iterator begin() const { return instructions.begin(); }
+ iterator end() { return instructions.end(); }
+ const_iterator end() const { return instructions.end(); }
+
+ reverse_iterator rbegin() { return instructions.rbegin(); }
+ const_reverse_iterator rbegin() const { return instructions.rbegin(); }
+ reverse_iterator rend() { return instructions.rend(); }
+ const_reverse_iterator rend() const { return instructions.rend(); }
+
+ const_iterator cbegin() const { return instructions.cbegin(); }
+ const_iterator cend() const { return instructions.cend(); }
+
+ const_reverse_iterator crbegin() const { return instructions.crbegin(); }
+ const_reverse_iterator crend() const { return instructions.crend(); }
+
+ /**
+ * Appends a new instruction to the end of this basic block,
+ * handling any allocations necessary to do so.
+ *
+ * @param op Opcode representing the instruction to add.
+ * @param args A sequence of Value instances used as arguments for the instruction.
+ */
+ void AppendNewInst(Opcode op, std::initializer_list<Value> args);
+
+ /**
+ * Prepends a new instruction to this basic block before the insertion point,
+ * handling any allocations necessary to do so.
+ *
+ * @param insertion_point Where to insert the new instruction.
+ * @param op Opcode representing the instruction to add.
+ * @param args A sequence of Value instances used as arguments for the instruction.
+ * @returns Iterator to the newly created instruction.
+ */
+ iterator PrependNewInst(iterator insertion_point, Opcode op, std::initializer_list<Value> args);
+
+ /// Gets the starting location for this basic block.
+ LocationDescriptor Location() const;
+ /// Gets the end location for this basic block.
+ LocationDescriptor EndLocation() const;
+ /// Sets the end location for this basic block.
+ void SetEndLocation(const LocationDescriptor& descriptor);
+
+ /// Gets the condition required to pass in order to execute this block.
+ Cond GetCondition() const;
+ /// Sets the condition required to pass in order to execute this block.
+ void SetCondition(Cond condition);
+
+ /// Gets the location of the block to execute if the predicated condition fails.
+ LocationDescriptor ConditionFailedLocation() const;
+ /// Sets the location of the block to execute if the predicated condition fails.
+ void SetConditionFailedLocation(LocationDescriptor fail_location);
+ /// Determines whether or not a predicated condition failure block is present.
+ bool HasConditionFailedLocation() const;
+
+ /// Gets a mutable reference to the condition failed cycle count.
+ size_t& ConditionFailedCycleCount();
+ /// Gets an immutable reference to the condition failed cycle count.
+ const size_t& ConditionFailedCycleCount() const;
+
+ /// Gets a mutable reference to the instruction list for this basic block.
+ InstructionList& Instructions();
+ /// Gets an immutable reference to the instruction list for this basic block.
+ const InstructionList& Instructions() const;
+
+ /// Gets the terminal instruction for this basic block.
+ Terminal GetTerminal() const;
+ /// Sets the terminal instruction for this basic block.
+ void SetTerminal(Terminal term);
+ /// Replaces the terminal instruction for this basic block.
+ void ReplaceTerminal(Terminal term);
+ /// Determines whether or not this basic block has a terminal instruction.
+ bool HasTerminal() const;
+
+ /// Gets a mutable reference to the cycle count for this basic block.
+ size_t& CycleCount();
+ /// Gets an immutable reference to the cycle count for this basic block.
+ const size_t& CycleCount() const;
+
+private:
+ /// Description of the starting location of this block
+ LocationDescriptor location;
+ /// Description of the end location of this block
+ LocationDescriptor end_location;
+ /// Conditional to pass in order to execute this block
+ Cond cond;
+ /// Block to execute next if `cond` did not pass.
+ std::optional<LocationDescriptor> cond_failed = {};
+ /// Number of cycles this block takes to execute if the conditional fails.
+ size_t cond_failed_cycle_count = 0;
+
+ /// List of instructions in this block.
+ InstructionList instructions;
+ /// Memory pool for instruction list
+ std::unique_ptr<Common::Pool> instruction_alloc_pool;
+ /// Terminal instruction of this block.
+ Terminal terminal = Term::Invalid{};
+
+ /// Number of cycles this block takes to execute.
+ size_t cycle_count = 0;
+};
+
+/// Returns a string representation of the contents of block. Intended for debugging.
+std::string DumpBlock(const IR::Block& block);
+
+} // namespace Dynarmic::IR
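
As a rough usage sketch, not part of the patch: a block is constructed from a starting LocationDescriptor, instructions are appended with AppendNewInst, a terminal is set exactly once, and DumpBlock renders the result. This assumes IR::LocationDescriptor is constructible from a raw u64 and uses Opcode::Add32, whose three-argument (a, b, carry_in) form appears in ir_emitter.cpp below.

    #include <cstdint>
    #include <string>

    #include "dynarmic/ir/basic_block.h"
    #include "dynarmic/ir/opcodes.h"
    #include "dynarmic/ir/terminal.h"

    std::string DumpTinyBlock() {
        Dynarmic::IR::Block block{Dynarmic::IR::LocationDescriptor{std::uint64_t{0}}};

        // Add32 takes (a, b, carry_in); see IREmitter::Add in ir_emitter.cpp below.
        block.AppendNewInst(Dynarmic::IR::Opcode::Add32,
                            {Dynarmic::IR::Value{std::uint32_t{1}},
                             Dynarmic::IR::Value{std::uint32_t{2}},
                             Dynarmic::IR::Value{false}});

        block.SetTerminal(Dynarmic::IR::Term::ReturnToDispatch{});
        return Dynarmic::IR::DumpBlock(block);
    }
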
diff --git a/externals/dynarmic/src/dynarmic/ir/cond.h b/externals/dynarmic/src/dynarmic/ir/cond.h
new file mode 100644
index 0000000000..5bdd05eddc
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/cond.h
@@ -0,0 +1,31 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::IR {
+
+enum class Cond {
+ EQ,
+ NE,
+ CS,
+ CC,
+ MI,
+ PL,
+ VS,
+ VC,
+ HI,
+ LS,
+ GE,
+ LT,
+ GT,
+ LE,
+ AL,
+ NV,
+ HS = CS,
+ LO = CC,
+};
+
+} // namespace Dynarmic::IR
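
A trivial sketch confirming that the HS/LO spellings are aliases of CS/CC rather than distinct encodings:

    #include "dynarmic/ir/cond.h"

    static_assert(Dynarmic::IR::Cond::HS == Dynarmic::IR::Cond::CS);
    static_assert(Dynarmic::IR::Cond::LO == Dynarmic::IR::Cond::CC);
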
diff --git a/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
new file mode 100644
index 0000000000..fc4f69b3e0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
@@ -0,0 +1,2890 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/ir_emitter.h"
+
+#include <vector>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit_cast.hpp>
+
+#include "dynarmic/ir/opcodes.h"
+
+namespace Dynarmic::IR {
+
+U1 IREmitter::Imm1(bool imm1) const {
+ return U1(Value(imm1));
+}
+
+U8 IREmitter::Imm8(u8 imm8) const {
+ return U8(Value(imm8));
+}
+
+U16 IREmitter::Imm16(u16 imm16) const {
+ return U16(Value(imm16));
+}
+
+U32 IREmitter::Imm32(u32 imm32) const {
+ return U32(Value(imm32));
+}
+
+U64 IREmitter::Imm64(u64 imm64) const {
+ return U64(Value(imm64));
+}
+
+void IREmitter::PushRSB(const LocationDescriptor& return_location) {
+ Inst(Opcode::PushRSB, IR::Value(return_location.Value()));
+}
+
+U64 IREmitter::Pack2x32To1x64(const U32& lo, const U32& hi) {
+ return Inst<U64>(Opcode::Pack2x32To1x64, lo, hi);
+}
+
+U128 IREmitter::Pack2x64To1x128(const U64& lo, const U64& hi) {
+ return Inst<U128>(Opcode::Pack2x64To1x128, lo, hi);
+}
+
+UAny IREmitter::LeastSignificant(size_t bitsize, const U32U64& value) {
+ switch (bitsize) {
+ case 8:
+ return LeastSignificantByte(value);
+ case 16:
+ return LeastSignificantHalf(value);
+ case 32:
+ if (value.GetType() == Type::U32) {
+ return value;
+ }
+ return LeastSignificantWord(value);
+ case 64:
+ ASSERT(value.GetType() == Type::U64);
+ return value;
+ }
+ ASSERT_FALSE("Invalid bitsize");
+}
+
+U32 IREmitter::LeastSignificantWord(const U64& value) {
+ return Inst<U32>(Opcode::LeastSignificantWord, value);
+}
+
+U16 IREmitter::LeastSignificantHalf(U32U64 value) {
+ if (value.GetType() == Type::U64) {
+ value = LeastSignificantWord(value);
+ }
+ return Inst<U16>(Opcode::LeastSignificantHalf, value);
+}
+
+U8 IREmitter::LeastSignificantByte(U32U64 value) {
+ if (value.GetType() == Type::U64) {
+ value = LeastSignificantWord(value);
+ }
+ return Inst<U8>(Opcode::LeastSignificantByte, value);
+}
+
+ResultAndCarry<U32> IREmitter::MostSignificantWord(const U64& value) {
+ const auto result = Inst<U32>(Opcode::MostSignificantWord, value);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+U1 IREmitter::MostSignificantBit(const U32& value) {
+ return Inst<U1>(Opcode::MostSignificantBit, value);
+}
+
+U1 IREmitter::IsZero(const U32& value) {
+ return Inst<U1>(Opcode::IsZero32, value);
+}
+
+U1 IREmitter::IsZero(const U64& value) {
+ return Inst<U1>(Opcode::IsZero64, value);
+}
+
+U1 IREmitter::IsZero(const U32U64& value) {
+ if (value.GetType() == Type::U32) {
+ return Inst<U1>(Opcode::IsZero32, value);
+ } else {
+ return Inst<U1>(Opcode::IsZero64, value);
+ }
+}
+
+U1 IREmitter::TestBit(const U32U64& value, const U8& bit) {
+ if (value.GetType() == Type::U32) {
+ return Inst<U1>(Opcode::TestBit, IndeterminateExtendToLong(value), bit);
+ } else {
+ return Inst<U1>(Opcode::TestBit, value, bit);
+ }
+}
+
+U32 IREmitter::ConditionalSelect(Cond cond, const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::ConditionalSelect32, Value{cond}, a, b);
+}
+
+U64 IREmitter::ConditionalSelect(Cond cond, const U64& a, const U64& b) {
+ return Inst<U64>(Opcode::ConditionalSelect64, Value{cond}, a, b);
+}
+
+NZCV IREmitter::ConditionalSelect(Cond cond, const NZCV& a, const NZCV& b) {
+ return Inst<NZCV>(Opcode::ConditionalSelectNZCV, Value{cond}, a, b);
+}
+
+U32U64 IREmitter::ConditionalSelect(Cond cond, const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::ConditionalSelect32, Value{cond}, a, b);
+ } else {
+ return Inst<U64>(Opcode::ConditionalSelect64, Value{cond}, a, b);
+ }
+}
+
+U1 IREmitter::GetCFlagFromNZCV(const NZCV& nzcv) {
+ return Inst<U1>(Opcode::GetCFlagFromNZCV, nzcv);
+}
+
+NZCV IREmitter::NZCVFromPackedFlags(const U32& a) {
+ return Inst<NZCV>(Opcode::NZCVFromPackedFlags, a);
+}
+
+NZCV IREmitter::NZCVFrom(const Value& value) {
+ return Inst<NZCV>(Opcode::GetNZCVFromOp, value);
+}
+
+ResultAndCarry<U32> IREmitter::LogicalShiftLeft(const U32& value_in, const U8& shift_amount, const U1& carry_in) {
+ const auto result = Inst<U32>(Opcode::LogicalShiftLeft32, value_in, shift_amount, carry_in);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+ResultAndCarry<U32> IREmitter::LogicalShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) {
+ const auto result = Inst<U32>(Opcode::LogicalShiftRight32, value_in, shift_amount, carry_in);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+ResultAndCarry<U32> IREmitter::ArithmeticShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) {
+ const auto result = Inst<U32>(Opcode::ArithmeticShiftRight32, value_in, shift_amount, carry_in);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+ResultAndCarry<U32> IREmitter::RotateRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) {
+ const auto result = Inst<U32>(Opcode::RotateRight32, value_in, shift_amount, carry_in);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+ResultAndCarry<U32> IREmitter::RotateRightExtended(const U32& value_in, const U1& carry_in) {
+ const auto result = Inst<U32>(Opcode::RotateRightExtended, value_in, carry_in);
+ const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result);
+ return {result, carry_out};
+}
+
+U32U64 IREmitter::LogicalShiftLeft(const U32U64& value_in, const U8& shift_amount) {
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::LogicalShiftLeft32, value_in, shift_amount, Imm1(0));
+ } else {
+ return Inst<U64>(Opcode::LogicalShiftLeft64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::LogicalShiftRight(const U32U64& value_in, const U8& shift_amount) {
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::LogicalShiftRight32, value_in, shift_amount, Imm1(0));
+ } else {
+ return Inst<U64>(Opcode::LogicalShiftRight64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::ArithmeticShiftRight(const U32U64& value_in, const U8& shift_amount) {
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::ArithmeticShiftRight32, value_in, shift_amount, Imm1(0));
+ } else {
+ return Inst<U64>(Opcode::ArithmeticShiftRight64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::RotateRight(const U32U64& value_in, const U8& shift_amount) {
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::RotateRight32, value_in, shift_amount, Imm1(0));
+ } else {
+ return Inst<U64>(Opcode::RotateRight64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::LogicalShiftLeftMasked(const U32U64& value_in, const U32U64& shift_amount) {
+ ASSERT(value_in.GetType() == shift_amount.GetType());
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::LogicalShiftLeftMasked32, value_in, shift_amount);
+ } else {
+ return Inst<U64>(Opcode::LogicalShiftLeftMasked64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::LogicalShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount) {
+ ASSERT(value_in.GetType() == shift_amount.GetType());
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::LogicalShiftRightMasked32, value_in, shift_amount);
+ } else {
+ return Inst<U64>(Opcode::LogicalShiftRightMasked64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::ArithmeticShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount) {
+ ASSERT(value_in.GetType() == shift_amount.GetType());
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::ArithmeticShiftRightMasked32, value_in, shift_amount);
+ } else {
+ return Inst<U64>(Opcode::ArithmeticShiftRightMasked64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::RotateRightMasked(const U32U64& value_in, const U32U64& shift_amount) {
+ ASSERT(value_in.GetType() == shift_amount.GetType());
+ if (value_in.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::RotateRightMasked32, value_in, shift_amount);
+ } else {
+ return Inst<U64>(Opcode::RotateRightMasked64, value_in, shift_amount);
+ }
+}
+
+U32U64 IREmitter::AddWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Add32, a, b, carry_in);
+ } else {
+ return Inst<U64>(Opcode::Add64, a, b, carry_in);
+ }
+}
+
+U32U64 IREmitter::Add(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Add32, a, b, Imm1(0));
+ } else {
+ return Inst<U64>(Opcode::Add64, a, b, Imm1(0));
+ }
+}
+
+U32U64 IREmitter::SubWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Sub32, a, b, carry_in);
+ } else {
+ return Inst<U64>(Opcode::Sub64, a, b, carry_in);
+ }
+}
+
+U32U64 IREmitter::Sub(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Sub32, a, b, Imm1(1));
+ } else {
+ return Inst<U64>(Opcode::Sub64, a, b, Imm1(1));
+ }
+}
+
+U32U64 IREmitter::Mul(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Mul32, a, b);
+ }
+
+ return Inst<U64>(Opcode::Mul64, a, b);
+}
+
+U64 IREmitter::UnsignedMultiplyHigh(const U64& a, const U64& b) {
+ return Inst<U64>(Opcode::UnsignedMultiplyHigh64, a, b);
+}
+
+U64 IREmitter::SignedMultiplyHigh(const U64& a, const U64& b) {
+ return Inst<U64>(Opcode::SignedMultiplyHigh64, a, b);
+}
+
+U32U64 IREmitter::UnsignedDiv(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::UnsignedDiv32, a, b);
+ }
+
+ return Inst<U64>(Opcode::UnsignedDiv64, a, b);
+}
+
+U32U64 IREmitter::SignedDiv(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::SignedDiv32, a, b);
+ }
+
+ return Inst<U64>(Opcode::SignedDiv64, a, b);
+}
+
+U32U64 IREmitter::And(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::And32, a, b);
+ } else {
+ return Inst<U64>(Opcode::And64, a, b);
+ }
+}
+
+U32U64 IREmitter::AndNot(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::AndNot32, a, b);
+ } else {
+ return Inst<U64>(Opcode::AndNot64, a, b);
+ }
+}
+
+U32U64 IREmitter::Eor(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Eor32, a, b);
+ } else {
+ return Inst<U64>(Opcode::Eor64, a, b);
+ }
+}
+
+U32U64 IREmitter::Or(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Or32, a, b);
+ } else {
+ return Inst<U64>(Opcode::Or64, a, b);
+ }
+}
+
+U32U64 IREmitter::Not(const U32U64& a) {
+ if (a.GetType() == Type::U32) {
+ return Inst<U32>(Opcode::Not32, a);
+ } else {
+ return Inst<U64>(Opcode::Not64, a);
+ }
+}
+
+U64 IREmitter::SignExtendToLong(const UAny& a) {
+ switch (a.GetType()) {
+ case Type::U8:
+ return Inst<U64>(Opcode::SignExtendByteToLong, a);
+ case Type::U16:
+ return Inst<U64>(Opcode::SignExtendHalfToLong, a);
+ case Type::U32:
+ return Inst<U64>(Opcode::SignExtendWordToLong, a);
+ case Type::U64:
+ return U64(a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::SignExtendToWord(const UAny& a) {
+ switch (a.GetType()) {
+ case Type::U8:
+ return Inst<U32>(Opcode::SignExtendByteToWord, a);
+ case Type::U16:
+ return Inst<U32>(Opcode::SignExtendHalfToWord, a);
+ case Type::U32:
+ return U32(a);
+ case Type::U64:
+ return Inst<U32>(Opcode::LeastSignificantWord, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U64 IREmitter::SignExtendWordToLong(const U32& a) {
+ return Inst<U64>(Opcode::SignExtendWordToLong, a);
+}
+
+U32 IREmitter::SignExtendHalfToWord(const U16& a) {
+ return Inst<U32>(Opcode::SignExtendHalfToWord, a);
+}
+
+U32 IREmitter::SignExtendByteToWord(const U8& a) {
+ return Inst<U32>(Opcode::SignExtendByteToWord, a);
+}
+
+U64 IREmitter::ZeroExtendToLong(const UAny& a) {
+ switch (a.GetType()) {
+ case Type::U8:
+ return Inst<U64>(Opcode::ZeroExtendByteToLong, a);
+ case Type::U16:
+ return Inst<U64>(Opcode::ZeroExtendHalfToLong, a);
+ case Type::U32:
+ return Inst<U64>(Opcode::ZeroExtendWordToLong, a);
+ case Type::U64:
+ return U64(a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::ZeroExtendToWord(const UAny& a) {
+ switch (a.GetType()) {
+ case Type::U8:
+ return Inst<U32>(Opcode::ZeroExtendByteToWord, a);
+ case Type::U16:
+ return Inst<U32>(Opcode::ZeroExtendHalfToWord, a);
+ case Type::U32:
+ return U32(a);
+ case Type::U64:
+ return Inst<U32>(Opcode::LeastSignificantWord, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::ZeroExtendToQuad(const UAny& a) {
+ return Inst<U128>(Opcode::ZeroExtendLongToQuad, ZeroExtendToLong(a));
+}
+
+U64 IREmitter::ZeroExtendWordToLong(const U32& a) {
+ return Inst<U64>(Opcode::ZeroExtendWordToLong, a);
+}
+
+U32 IREmitter::ZeroExtendHalfToWord(const U16& a) {
+ return Inst<U32>(Opcode::ZeroExtendHalfToWord, a);
+}
+
+U32 IREmitter::ZeroExtendByteToWord(const U8& a) {
+ return Inst<U32>(Opcode::ZeroExtendByteToWord, a);
+}
+
+U32 IREmitter::IndeterminateExtendToWord(const UAny& a) {
+ // TODO: Implement properly
+ return ZeroExtendToWord(a);
+}
+
+U64 IREmitter::IndeterminateExtendToLong(const UAny& a) {
+ // TODO: Implement properly
+ return ZeroExtendToLong(a);
+}
+
+U32 IREmitter::ByteReverseWord(const U32& a) {
+ return Inst<U32>(Opcode::ByteReverseWord, a);
+}
+
+U16 IREmitter::ByteReverseHalf(const U16& a) {
+ return Inst<U16>(Opcode::ByteReverseHalf, a);
+}
+
+U64 IREmitter::ByteReverseDual(const U64& a) {
+ return Inst<U64>(Opcode::ByteReverseDual, a);
+}
+
+U32U64 IREmitter::CountLeadingZeros(const U32U64& a) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::CountLeadingZeros32, a);
+ }
+
+ return Inst<U64>(Opcode::CountLeadingZeros64, a);
+}
+
+U32U64 IREmitter::ExtractRegister(const U32U64& a, const U32U64& b, const U8& lsb) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::ExtractRegister32, a, b, lsb);
+ }
+
+ return Inst<U64>(Opcode::ExtractRegister64, a, b, lsb);
+}
+
+U32U64 IREmitter::ReplicateBit(const U32U64& a, u8 bit) {
+ if (a.GetType() == IR::Type::U32) {
+ ASSERT(bit < 32);
+ return Inst<U32>(Opcode::ReplicateBit32, a, Imm8(bit));
+ }
+
+ ASSERT(bit < 64);
+ return Inst<U64>(Opcode::ReplicateBit64, a, Imm8(bit));
+}
+
+U32U64 IREmitter::MaxSigned(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::MaxSigned32, a, b);
+ }
+
+ return Inst<U64>(Opcode::MaxSigned64, a, b);
+}
+
+U32U64 IREmitter::MaxUnsigned(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::MaxUnsigned32, a, b);
+ }
+
+ return Inst<U64>(Opcode::MaxUnsigned64, a, b);
+}
+
+U32U64 IREmitter::MinSigned(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::MinSigned32, a, b);
+ }
+
+ return Inst<U64>(Opcode::MinSigned64, a, b);
+}
+
+U32U64 IREmitter::MinUnsigned(const U32U64& a, const U32U64& b) {
+ if (a.GetType() == IR::Type::U32) {
+ return Inst<U32>(Opcode::MinUnsigned32, a, b);
+ }
+
+ return Inst<U64>(Opcode::MinUnsigned64, a, b);
+}
+
+ResultAndOverflow<U32> IREmitter::SignedSaturatedAddWithFlag(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::SignedSaturatedAddWithFlag32, a, b);
+ const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+ return {result, overflow};
+}
+
+ResultAndOverflow<U32> IREmitter::SignedSaturatedSubWithFlag(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::SignedSaturatedSubWithFlag32, a, b);
+ const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+ return {result, overflow};
+}
+
+ResultAndOverflow<U32> IREmitter::SignedSaturation(const U32& a, size_t bit_size_to_saturate_to) {
+ ASSERT(bit_size_to_saturate_to >= 1 && bit_size_to_saturate_to <= 32);
+ const auto result = Inst<U32>(Opcode::SignedSaturation, a, Imm8(static_cast<u8>(bit_size_to_saturate_to)));
+ const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+ return {result, overflow};
+}
+
+ResultAndOverflow<U32> IREmitter::UnsignedSaturation(const U32& a, size_t bit_size_to_saturate_to) {
+ ASSERT(bit_size_to_saturate_to <= 31);
+ const auto result = Inst<U32>(Opcode::UnsignedSaturation, a, Imm8(static_cast<u8>(bit_size_to_saturate_to)));
+ const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result);
+ return {result, overflow};
+}
+
+UAny IREmitter::SignedSaturatedAdd(const UAny& a, const UAny& b) {
+ ASSERT(a.GetType() == b.GetType());
+ const auto result = [&]() -> IR::UAny {
+ switch (a.GetType()) {
+ case IR::Type::U8:
+ return Inst<U8>(Opcode::SignedSaturatedAdd8, a, b);
+ case IR::Type::U16:
+ return Inst<U16>(Opcode::SignedSaturatedAdd16, a, b);
+ case IR::Type::U32:
+ return Inst<U32>(Opcode::SignedSaturatedAdd32, a, b);
+ case IR::Type::U64:
+ return Inst<U64>(Opcode::SignedSaturatedAdd64, a, b);
+ default:
+ return IR::UAny{};
+ }
+ }();
+ return result;
+}
+
+UAny IREmitter::SignedSaturatedDoublingMultiplyReturnHigh(const UAny& a, const UAny& b) {
+ ASSERT(a.GetType() == b.GetType());
+ const auto result = [&]() -> IR::UAny {
+ switch (a.GetType()) {
+ case IR::Type::U16:
+ return Inst<U16>(Opcode::SignedSaturatedDoublingMultiplyReturnHigh16, a, b);
+ case IR::Type::U32:
+ return Inst<U32>(Opcode::SignedSaturatedDoublingMultiplyReturnHigh32, a, b);
+ default:
+ UNREACHABLE();
+ }
+ }();
+ return result;
+}
+
+UAny IREmitter::SignedSaturatedSub(const UAny& a, const UAny& b) {
+ ASSERT(a.GetType() == b.GetType());
+ const auto result = [&]() -> IR::UAny {
+ switch (a.GetType()) {
+ case IR::Type::U8:
+ return Inst<U8>(Opcode::SignedSaturatedSub8, a, b);
+ case IR::Type::U16:
+ return Inst<U16>(Opcode::SignedSaturatedSub16, a, b);
+ case IR::Type::U32:
+ return Inst<U32>(Opcode::SignedSaturatedSub32, a, b);
+ case IR::Type::U64:
+ return Inst<U64>(Opcode::SignedSaturatedSub64, a, b);
+ default:
+ return IR::UAny{};
+ }
+ }();
+ return result;
+}
+
+UAny IREmitter::UnsignedSaturatedAdd(const UAny& a, const UAny& b) {
+ ASSERT(a.GetType() == b.GetType());
+ const auto result = [&]() -> IR::UAny {
+ switch (a.GetType()) {
+ case IR::Type::U8:
+ return Inst<U8>(Opcode::UnsignedSaturatedAdd8, a, b);
+ case IR::Type::U16:
+ return Inst<U16>(Opcode::UnsignedSaturatedAdd16, a, b);
+ case IR::Type::U32:
+ return Inst<U32>(Opcode::UnsignedSaturatedAdd32, a, b);
+ case IR::Type::U64:
+ return Inst<U64>(Opcode::UnsignedSaturatedAdd64, a, b);
+ default:
+ return IR::UAny{};
+ }
+ }();
+ return result;
+}
+
+UAny IREmitter::UnsignedSaturatedSub(const UAny& a, const UAny& b) {
+ ASSERT(a.GetType() == b.GetType());
+ const auto result = [&]() -> IR::UAny {
+ switch (a.GetType()) {
+ case IR::Type::U8:
+ return Inst<U8>(Opcode::UnsignedSaturatedSub8, a, b);
+ case IR::Type::U16:
+ return Inst<U16>(Opcode::UnsignedSaturatedSub16, a, b);
+ case IR::Type::U32:
+ return Inst<U32>(Opcode::UnsignedSaturatedSub32, a, b);
+ case IR::Type::U64:
+ return Inst<U64>(Opcode::UnsignedSaturatedSub64, a, b);
+ default:
+ return IR::UAny{};
+ }
+ }();
+ return result;
+}
+
+U128 IREmitter::VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAdd8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAdd16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAdd32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAdd64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedSub8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedSub16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedSub32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedSub64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedSub8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedSub16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedSub32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedSub64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+ResultAndGE<U32> IREmitter::PackedAddU8(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddU8, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedAddS8(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddS8, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedAddU16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddU16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedAddS16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddS16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubU8(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubU8, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubS8(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubS8, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubU16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubU16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubS16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubS16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedAddSubU16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddSubU16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedAddSubS16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedAddSubS16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubAddU16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubAddU16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+ResultAndGE<U32> IREmitter::PackedSubAddS16(const U32& a, const U32& b) {
+ const auto result = Inst<U32>(Opcode::PackedSubAddS16, a, b);
+ const auto ge = Inst<U32>(Opcode::GetGEFromOp, result);
+ return {result, ge};
+}
+
+U32 IREmitter::PackedHalvingAddU8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddU8, a, b);
+}
+
+U32 IREmitter::PackedHalvingAddS8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddS8, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubU8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubU8, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubS8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubS8, a, b);
+}
+
+U32 IREmitter::PackedHalvingAddU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddU16, a, b);
+}
+
+U32 IREmitter::PackedHalvingAddS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddS16, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubU16, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubS16, a, b);
+}
+
+U32 IREmitter::PackedHalvingAddSubU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddSubU16, a, b);
+}
+
+U32 IREmitter::PackedHalvingAddSubS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingAddSubS16, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubAddU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubAddU16, a, b);
+}
+
+U32 IREmitter::PackedHalvingSubAddS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedHalvingSubAddS16, a, b);
+}
+
+U32 IREmitter::PackedSaturatedAddU8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedAddU8, a, b);
+}
+
+U32 IREmitter::PackedSaturatedAddS8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedAddS8, a, b);
+}
+
+U32 IREmitter::PackedSaturatedSubU8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedSubU8, a, b);
+}
+
+U32 IREmitter::PackedSaturatedSubS8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedSubS8, a, b);
+}
+
+U32 IREmitter::PackedSaturatedAddU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedAddU16, a, b);
+}
+
+U32 IREmitter::PackedSaturatedAddS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedAddS16, a, b);
+}
+
+U32 IREmitter::PackedSaturatedSubU16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedSubU16, a, b);
+}
+
+U32 IREmitter::PackedSaturatedSubS16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSaturatedSubS16, a, b);
+}
+
+U32 IREmitter::PackedAbsDiffSumU8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedAbsDiffSumU8, a, b);
+}
+
+U32 IREmitter::PackedSelect(const U32& ge, const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::PackedSelect, ge, a, b);
+}
+
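+// CRC helpers cover both the Castagnoli (CRC32C) and ISO polynomials, with
+// variants consuming 8, 16, 32 or 64 bits of data per step.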
+U32 IREmitter::CRC32Castagnoli8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32Castagnoli8, a, b);
+}
+
+U32 IREmitter::CRC32Castagnoli16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32Castagnoli16, a, b);
+}
+
+U32 IREmitter::CRC32Castagnoli32(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32Castagnoli32, a, b);
+}
+
+U32 IREmitter::CRC32Castagnoli64(const U32& a, const U64& b) {
+ return Inst<U32>(Opcode::CRC32Castagnoli64, a, b);
+}
+
+U32 IREmitter::CRC32ISO8(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32ISO8, a, b);
+}
+
+U32 IREmitter::CRC32ISO16(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32ISO16, a, b);
+}
+
+U32 IREmitter::CRC32ISO32(const U32& a, const U32& b) {
+ return Inst<U32>(Opcode::CRC32ISO32, a, b);
+}
+
+U32 IREmitter::CRC32ISO64(const U32& a, const U64& b) {
+ return Inst<U32>(Opcode::CRC32ISO64, a, b);
+}
+
+U128 IREmitter::AESDecryptSingleRound(const U128& a) {
+ return Inst<U128>(Opcode::AESDecryptSingleRound, a);
+}
+
+U128 IREmitter::AESEncryptSingleRound(const U128& a) {
+ return Inst<U128>(Opcode::AESEncryptSingleRound, a);
+}
+
+U128 IREmitter::AESInverseMixColumns(const U128& a) {
+ return Inst<U128>(Opcode::AESInverseMixColumns, a);
+}
+
+U128 IREmitter::AESMixColumns(const U128& a) {
+ return Inst<U128>(Opcode::AESMixColumns, a);
+}
+
+U8 IREmitter::SM4AccessSubstitutionBox(const U8& a) {
+ return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a);
+}
+
+U128 IREmitter::SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1) {
+ return Inst<U128>(Opcode::SHA256Hash, x, y, w, Imm1(part1));
+}
+
+U128 IREmitter::SHA256MessageSchedule0(const U128& x, const U128& y) {
+ return Inst<U128>(Opcode::SHA256MessageSchedule0, x, y);
+}
+
+U128 IREmitter::SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z) {
+ return Inst<U128>(Opcode::SHA256MessageSchedule1, x, y, z);
+}
+
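+// Element accessors dispatch to a size-specific opcode and pass the element
+// index as an 8-bit immediate.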
+UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) {
+ ASSERT_MSG(esize * index < 128, "Invalid index");
+ switch (esize) {
+ case 8:
+ return Inst<U8>(Opcode::VectorGetElement8, a, Imm8(static_cast<u8>(index)));
+ case 16:
+ return Inst<U16>(Opcode::VectorGetElement16, a, Imm8(static_cast<u8>(index)));
+ case 32:
+ return Inst<U32>(Opcode::VectorGetElement32, a, Imm8(static_cast<u8>(index)));
+ case 64:
+ return Inst<U64>(Opcode::VectorGetElement64, a, Imm8(static_cast<u8>(index)));
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorSetElement(size_t esize, const U128& a, size_t index, const IR::UAny& elem) {
+ ASSERT_MSG(esize * index < 128, "Invalid index");
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSetElement8, a, Imm8(static_cast<u8>(index)), elem);
+ case 16:
+ return Inst<U128>(Opcode::VectorSetElement16, a, Imm8(static_cast<u8>(index)), elem);
+ case 32:
+ return Inst<U128>(Opcode::VectorSetElement32, a, Imm8(static_cast<u8>(index)), elem);
+ case 64:
+ return Inst<U128>(Opcode::VectorSetElement64, a, Imm8(static_cast<u8>(index)), elem);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorAbs(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorAbs8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorAbs16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorAbs32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorAbs64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorAdd(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorAdd8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorAdd16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorAdd32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorAdd64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
+ return Inst<U128>(Opcode::VectorAnd, a, b);
+}
+
+U128 IREmitter::VectorAndNot(const U128& a, const U128& b) {
+ return Inst<U128>(Opcode::VectorAndNot, a, b);
+}
+
+U128 IREmitter::VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorArithmeticShiftRight8, a, Imm8(shift_amount));
+ case 16:
+ return Inst<U128>(Opcode::VectorArithmeticShiftRight16, a, Imm8(shift_amount));
+ case 32:
+ return Inst<U128>(Opcode::VectorArithmeticShiftRight32, a, Imm8(shift_amount));
+ case 64:
+ return Inst<U128>(Opcode::VectorArithmeticShiftRight64, a, Imm8(shift_amount));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorArithmeticVShift(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorArithmeticVShift8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorArithmeticVShift16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorArithmeticVShift32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorArithmeticVShift64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorBroadcastLower(size_t esize, const UAny& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorBroadcastLower8, U8(a));
+ case 16:
+ return Inst<U128>(Opcode::VectorBroadcastLower16, U16(a));
+ case 32:
+ return Inst<U128>(Opcode::VectorBroadcastLower32, U32(a));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorBroadcast8, U8(a));
+ case 16:
+ return Inst<U128>(Opcode::VectorBroadcast16, U16(a));
+ case 32:
+ return Inst<U128>(Opcode::VectorBroadcast32, U32(a));
+ case 64:
+ return Inst<U128>(Opcode::VectorBroadcast64, U64(a));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorBroadcastElementLower(size_t esize, const U128& a, size_t index) {
+ ASSERT_MSG(esize * index < 128, "Invalid index");
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorBroadcastElementLower8, a, u8(index));
+ case 16:
+ return Inst<U128>(Opcode::VectorBroadcastElementLower16, a, u8(index));
+ case 32:
+ return Inst<U128>(Opcode::VectorBroadcastElementLower32, a, u8(index));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorBroadcastElement(size_t esize, const U128& a, size_t index) {
+ ASSERT_MSG(esize * index < 128, "Invalid index");
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorBroadcastElement8, a, u8(index));
+ case 16:
+ return Inst<U128>(Opcode::VectorBroadcastElement16, a, u8(index));
+ case 32:
+ return Inst<U128>(Opcode::VectorBroadcastElement32, a, u8(index));
+ case 64:
+ return Inst<U128>(Opcode::VectorBroadcastElement64, a, u8(index));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorCountLeadingZeros(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorCountLeadingZeros8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorCountLeadingZeros16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorCountLeadingZeros32, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorDeinterleaveEven8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorDeinterleaveEven16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorDeinterleaveEven32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorDeinterleaveEven64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorDeinterleaveOdd8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorDeinterleaveOdd16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorDeinterleaveOdd32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorDeinterleaveOdd64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorDeinterleaveEvenLower8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorDeinterleaveEvenLower16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorDeinterleaveEvenLower32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorDeinterleaveOddLower8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorDeinterleaveOddLower16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorDeinterleaveOddLower32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorEor(const U128& a, const U128& b) {
+ return Inst<U128>(Opcode::VectorEor, a, b);
+}
+
+U128 IREmitter::VectorEqual(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorEqual8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorEqual16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorEqual32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorEqual64, a, b);
+ case 128:
+ return Inst<U128>(Opcode::VectorEqual128, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorExtract(const U128& a, const U128& b, size_t position) {
+ ASSERT(position <= 128);
+ return Inst<U128>(Opcode::VectorExtract, a, b, Imm8(static_cast<u8>(position)));
+}
+
+U128 IREmitter::VectorExtractLower(const U128& a, const U128& b, size_t position) {
+ ASSERT(position <= 64);
+ return Inst<U128>(Opcode::VectorExtractLower, a, b, Imm8(static_cast<u8>(position)));
+}
+
+U128 IREmitter::VectorGreaterSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorGreaterS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorGreaterS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorGreaterS32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorGreaterS64, a, b);
+ }
+ UNREACHABLE();
+}
+
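+// The comparisons below are synthesized from existing primitives rather than
+// dedicated opcodes: signed a >= b is (a > b) | (a == b), unsigned a >= b is
+// max(a, b) == a, and unsigned a > b is !(min(a, b) == a).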
+U128 IREmitter::VectorGreaterEqualSigned(size_t esize, const U128& a, const U128& b) {
+ return VectorOr(VectorGreaterSigned(esize, a, b), VectorEqual(esize, a, b));
+}
+
+U128 IREmitter::VectorGreaterEqualUnsigned(size_t esize, const U128& a, const U128& b) {
+ return VectorEqual(esize, VectorMaxUnsigned(esize, a, b), a);
+}
+
+U128 IREmitter::VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b) {
+ return VectorNot(VectorEqual(esize, VectorMinUnsigned(esize, a, b), a));
+}
+
+U128 IREmitter::VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorHalvingAddS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorHalvingAddS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorHalvingAddS32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorHalvingAddU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorHalvingAddU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorHalvingAddU32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorHalvingSubS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorHalvingSubS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorHalvingSubS32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorHalvingSubU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorHalvingSubU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorHalvingSubU32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorInterleaveLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorInterleaveLower8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorInterleaveLower16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorInterleaveLower32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorInterleaveLower64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorInterleaveUpper(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorInterleaveUpper8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorInterleaveUpper16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorInterleaveUpper32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorInterleaveUpper64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorLessEqualSigned(size_t esize, const U128& a, const U128& b) {
+ return VectorNot(VectorGreaterSigned(esize, a, b));
+}
+
+U128 IREmitter::VectorLessEqualUnsigned(size_t esize, const U128& a, const U128& b) {
+ return VectorEqual(esize, VectorMinUnsigned(esize, a, b), a);
+}
+
+U128 IREmitter::VectorLessSigned(size_t esize, const U128& a, const U128& b) {
+ return VectorNot(VectorOr(VectorGreaterSigned(esize, a, b), VectorEqual(esize, a, b)));
+}
+
+U128 IREmitter::VectorLessUnsigned(size_t esize, const U128& a, const U128& b) {
+ return VectorNot(VectorEqual(esize, VectorMaxUnsigned(esize, a, b), a));
+}
+
+U128 IREmitter::VectorLogicalShiftLeft(size_t esize, const U128& a, u8 shift_amount) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorLogicalShiftLeft8, a, Imm8(shift_amount));
+ case 16:
+ return Inst<U128>(Opcode::VectorLogicalShiftLeft16, a, Imm8(shift_amount));
+ case 32:
+ return Inst<U128>(Opcode::VectorLogicalShiftLeft32, a, Imm8(shift_amount));
+ case 64:
+ return Inst<U128>(Opcode::VectorLogicalShiftLeft64, a, Imm8(shift_amount));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorLogicalShiftRight(size_t esize, const U128& a, u8 shift_amount) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorLogicalShiftRight8, a, Imm8(shift_amount));
+ case 16:
+ return Inst<U128>(Opcode::VectorLogicalShiftRight16, a, Imm8(shift_amount));
+ case 32:
+ return Inst<U128>(Opcode::VectorLogicalShiftRight32, a, Imm8(shift_amount));
+ case 64:
+ return Inst<U128>(Opcode::VectorLogicalShiftRight64, a, Imm8(shift_amount));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorLogicalVShift(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorLogicalVShift8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorLogicalVShift16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorLogicalVShift32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorLogicalVShift64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMaxSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMaxS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMaxS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMaxS32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorMaxS64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMaxUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMaxU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMaxU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMaxU32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorMaxU64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMinSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMinS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMinS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMinS32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorMinS64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMinUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMinU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMinU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMinU32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorMinU64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMultiply(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMultiply8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMultiply16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMultiply32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorMultiply64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMultiplySignedWiden8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMultiplySignedWiden16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMultiplySignedWiden32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorNarrow(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorNarrow16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorNarrow32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorNarrow64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorNot(const U128& a) {
+ return Inst<U128>(Opcode::VectorNot, a);
+}
+
+U128 IREmitter::VectorOr(const U128& a, const U128& b) {
+ return Inst<U128>(Opcode::VectorOr, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedAdd8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedAdd16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedAdd32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorPairedAdd64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorPairedAddLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedAddLower8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedAddLower16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedAddLower32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorPairedAddSignedWiden(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedAddSignedWiden8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedAddSignedWiden16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedAddSignedWiden32, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden32, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorPairedMaxSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMaxS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMaxS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMaxS32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMaxUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMaxU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMaxU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMaxU32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMinSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMinS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMinS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMinS32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMinUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMinU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMinU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMinU32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMaxSignedLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerS32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMaxUnsignedLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMaxLowerU32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMinSignedLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMinLowerS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMinLowerS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMinLowerS32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPairedMinUnsignedLower(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPairedMinLowerU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorPairedMinLowerU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorPairedMinLowerU32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
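+// Carry-less (polynomial) multiplication is only provided for 8-bit elements;
+// the widening (long) form additionally supports 64-bit elements.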
+U128 IREmitter::VectorPolynomialMultiply(const U128& a, const U128& b) {
+ return Inst<U128>(Opcode::VectorPolynomialMultiply8, a, b);
+}
+
+U128 IREmitter::VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorPolynomialMultiplyLong8, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorPolynomialMultiplyLong64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorPopulationCount(const U128& a) {
+ return Inst<U128>(Opcode::VectorPopulationCount, a);
+}
+
+U128 IREmitter::VectorReverseBits(const U128& a) {
+ return Inst<U128>(Opcode::VectorReverseBits, a);
+}
+
+U128 IREmitter::VectorReverseElementsInHalfGroups(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorReverseElementsInHalfGroups8, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorReverseElementsInWordGroups(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorReverseElementsInWordGroups8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorReverseElementsInWordGroups16, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorReverseElementsInLongGroups(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorReverseElementsInLongGroups8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorReverseElementsInLongGroups16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorReverseElementsInLongGroups32, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorReduceAdd(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorReduceAdd8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorReduceAdd16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorReduceAdd32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorReduceAdd64, a);
+    }
+    UNREACHABLE();
+}
+
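+// Element rotates have no dedicated opcode: they are composed from two logical
+// shifts OR'd together, and a rotate by zero simply returns the input.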
+U128 IREmitter::VectorRotateLeft(size_t esize, const U128& a, u8 amount) {
+ ASSERT(amount < esize);
+
+ if (amount == 0) {
+ return a;
+ }
+
+ return VectorOr(VectorLogicalShiftLeft(esize, a, amount),
+ VectorLogicalShiftRight(esize, a, static_cast<u8>(esize - amount)));
+}
+
+U128 IREmitter::VectorRotateRight(size_t esize, const U128& a, u8 amount) {
+ ASSERT(amount < esize);
+
+ if (amount == 0) {
+ return a;
+ }
+
+ return VectorOr(VectorLogicalShiftRight(esize, a, amount),
+ VectorLogicalShiftLeft(esize, a, static_cast<u8>(esize - amount)));
+}
+
+U128 IREmitter::VectorRotateWholeVectorRight(const U128& a, u8 amount) {
+ ASSERT(amount % 32 == 0);
+ return Inst<U128>(Opcode::VectorRotateWholeVectorRight, a, Imm8(amount));
+}
+
+U128 IREmitter::VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddS32, a, b);
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorRoundingHalvingAddU32, a, b);
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorRoundingShiftLeftSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftS8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftS16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftS32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftS64, a, b);
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftU8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftU16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftU32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorRoundingShiftLeftU64, a, b);
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignExtend(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignExtend8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignExtend16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignExtend32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignExtend64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedAbsoluteDifference8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedAbsoluteDifference16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedAbsoluteDifference32, a, b);
+ }
+ UNREACHABLE();
+}
+
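+// VectorSignedMultiply emits a single multiply instruction and then extracts the
+// upper and lower halves of the product with the GetUpperFromOp/GetLowerFromOp
+// pseudo-operations.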
+UpperAndLower IREmitter::VectorSignedMultiply(size_t esize, const U128& a, const U128& b) {
+ const Value multiply = [&] {
+ switch (esize) {
+ case 16:
+ return Inst(Opcode::VectorSignedMultiply16, a, b);
+ case 32:
+ return Inst(Opcode::VectorSignedMultiply32, a, b);
+ }
+ UNREACHABLE();
+ }();
+
+ return {
+ Inst<U128>(Opcode::GetUpperFromOp, multiply),
+ Inst<U128>(Opcode::GetLowerFromOp, multiply),
+ };
+}
+
+U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAbs8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAbs16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAbs32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAbs64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedDoublingMultiplyHigh(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHigh16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHigh32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorSignedSaturatedDoublingMultiplyHighRounding(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U128 IREmitter::VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedNeg(size_t esize, const U128& a) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNeg8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNeg16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNeg32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedNeg64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, u8 shift_amount) {
+ ASSERT(shift_amount < esize);
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned8, a, Imm8(shift_amount));
+ case 16:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned16, a, Imm8(shift_amount));
+ case 32:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned32, a, Imm8(shift_amount));
+ case 64:
+ return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned64, a, Imm8(shift_amount));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorSub8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorSub16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorSub32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorSub64, a, b);
+ }
+ UNREACHABLE();
+}
+
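+// VectorTable packs up to four table registers into one pseudo-operand; unused
+// slots are padded with empty values so the opcode always receives four arguments.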
+Table IREmitter::VectorTable(std::vector<U64> values) {
+ ASSERT(values.size() >= 1 && values.size() <= 4);
+ values.resize(4);
+ return Inst<Table>(Opcode::VectorTable, values[0], values[1], values[2], values[3]);
+}
+
+Table IREmitter::VectorTable(std::vector<U128> values) {
+ ASSERT(values.size() >= 1 && values.size() <= 4);
+ values.resize(4);
+ return Inst<Table>(Opcode::VectorTable, values[0], values[1], values[2], values[3]);
+}
+
+U64 IREmitter::VectorTableLookup(const U64& defaults, const Table& table, const U64& indices) {
+ ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U64);
+ return Inst<U64>(Opcode::VectorTableLookup64, defaults, table, indices);
+}
+
+U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, const U128& indices) {
+ ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U128);
+ return Inst<U128>(Opcode::VectorTableLookup128, defaults, table, indices);
+}
+
+U128 IREmitter::VectorTranspose(size_t esize, const U128& a, const U128& b, bool part) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorTranspose8, a, b, Imm1(part));
+ case 16:
+ return Inst<U128>(Opcode::VectorTranspose16, a, b, Imm1(part));
+ case 32:
+ return Inst<U128>(Opcode::VectorTranspose32, a, b, Imm1(part));
+ case 64:
+ return Inst<U128>(Opcode::VectorTranspose64, a, b, Imm1(part));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference32, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorUnsignedRecipEstimate(const U128& a) {
+ return Inst<U128>(Opcode::VectorUnsignedRecipEstimate, a);
+}
+
+U128 IREmitter::VectorUnsignedRecipSqrtEstimate(const U128& a) {
+ return Inst<U128>(Opcode::VectorUnsignedRecipSqrtEstimate, a);
+}
+
+U128 IREmitter::VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorUnsignedSaturatedNarrow(size_t esize, const U128& a) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorUnsignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b) {
+ switch (esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft8, a, b);
+ case 16:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft16, a, b);
+ case 32:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft32, a, b);
+ case 64:
+ return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft64, a, b);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorZeroExtend(size_t original_esize, const U128& a) {
+ switch (original_esize) {
+ case 8:
+ return Inst<U128>(Opcode::VectorZeroExtend8, a);
+ case 16:
+ return Inst<U128>(Opcode::VectorZeroExtend16, a);
+ case 32:
+ return Inst<U128>(Opcode::VectorZeroExtend32, a);
+ case 64:
+ return Inst<U128>(Opcode::VectorZeroExtend64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::VectorZeroUpper(const U128& a) {
+ return Inst<U128>(Opcode::VectorZeroUpper, a);
+}
+
+U128 IREmitter::ZeroVector() {
+ return Inst<U128>(Opcode::ZeroVector);
+}
+
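+// Scalar floating-point helpers dispatch on the IR type of the operand: U16, U32
+// and U64 correspond to half, single and double precision respectively.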
+U16U32U64 IREmitter::FPAbs(const U16U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPAbs16, a);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPAbs32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPAbs64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPAdd(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPAdd32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPAdd64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+NZCV IREmitter::FPCompare(const U32U64& a, const U32U64& b, bool exc_on_qnan) {
+ ASSERT(a.GetType() == b.GetType());
+
+ const IR::U1 exc_on_qnan_imm = Imm1(exc_on_qnan);
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<NZCV>(Opcode::FPCompare32, a, b, exc_on_qnan_imm);
+ case Type::U64:
+ return Inst<NZCV>(Opcode::FPCompare64, a, b, exc_on_qnan_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPDiv(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPDiv32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPDiv64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMax(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMax32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMax64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMaxNumeric(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMaxNumeric32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMaxNumeric64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMin(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMin32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMin64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMinNumeric(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMinNumeric32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMinNumeric64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMul(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMul32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMul64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPMulAdd(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPMulAdd16, a, b, c);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMulAdd32, a, b, c);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMulAdd64, a, b, c);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPMulSub(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPMulSub16, a, b, c);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMulSub32, a, b, c);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMulSub64, a, b, c);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPMulX32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPMulX64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPNeg(const U16U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPNeg16, a);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPNeg32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPNeg64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRecipEstimate(const U16U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRecipEstimate16, a);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRecipEstimate32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRecipEstimate64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRecipExponent(const U16U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRecipExponent16, a);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRecipExponent32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRecipExponent64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRecipStepFused(const U16U32U64& a, const U16U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRecipStepFused16, a, b);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRecipStepFused32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRecipStepFused64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRoundInt(const U16U32U64& a, FP::RoundingMode rounding, bool exact) {
+ const u8 rounding_value = static_cast<u8>(rounding);
+ const IR::U1 exact_imm = Imm1(exact);
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRoundInt16, a, rounding_value, exact_imm);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRoundInt32, a, rounding_value, exact_imm);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRoundInt64, a, rounding_value, exact_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRSqrtEstimate(const U16U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRSqrtEstimate16, a);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRSqrtEstimate32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRSqrtEstimate64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16U32U64 IREmitter::FPRSqrtStepFused(const U16U32U64& a, const U16U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPRSqrtStepFused16, a, b);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPRSqrtStepFused32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPRSqrtStepFused64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPSqrt(const U32U64& a) {
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPSqrt32, a);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPSqrt64, a);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32U64 IREmitter::FPSub(const U32U64& a, const U32U64& b) {
+ ASSERT(a.GetType() == b.GetType());
+
+ switch (a.GetType()) {
+ case Type::U32:
+ return Inst<U32>(Opcode::FPSub32, a, b);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPSub64, a, b);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16 IREmitter::FPDoubleToHalf(const U64& a, FP::RoundingMode rounding) {
+ return Inst<U16>(Opcode::FPDoubleToHalf, a, Imm8(static_cast<u8>(rounding)));
+}
+
+U32 IREmitter::FPDoubleToSingle(const U64& a, FP::RoundingMode rounding) {
+ return Inst<U32>(Opcode::FPDoubleToSingle, a, Imm8(static_cast<u8>(rounding)));
+}
+
+U64 IREmitter::FPHalfToDouble(const U16& a, FP::RoundingMode rounding) {
+ return Inst<U64>(Opcode::FPHalfToDouble, a, Imm8(static_cast<u8>(rounding)));
+}
+
+U32 IREmitter::FPHalfToSingle(const U16& a, FP::RoundingMode rounding) {
+ return Inst<U32>(Opcode::FPHalfToSingle, a, Imm8(static_cast<u8>(rounding)));
+}
+
+U64 IREmitter::FPSingleToDouble(const U32& a, FP::RoundingMode rounding) {
+ return Inst<U64>(Opcode::FPSingleToDouble, a, Imm8(static_cast<u8>(rounding)));
+}
+
+U16 IREmitter::FPSingleToHalf(const U32& a, FP::RoundingMode rounding) {
+ return Inst<U16>(Opcode::FPSingleToHalf, a, Imm8(static_cast<u8>(rounding)));
+}
+
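+// Float-to-fixed conversions encode the fractional-bit count and the rounding
+// mode as Imm8 operands and dispatch on the source precision.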
+U16 IREmitter::FPToFixedS16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 16);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPHalfToFixedS16, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U16>(Opcode::FPSingleToFixedS16, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U16>(Opcode::FPDoubleToFixedS16, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::FPToFixedS32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 32);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U32>(Opcode::FPHalfToFixedS32, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPSingleToFixedS32, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U32>(Opcode::FPDoubleToFixedS32, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U64 IREmitter::FPToFixedS64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 64);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U64>(Opcode::FPHalfToFixedS64, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U64>(Opcode::FPSingleToFixedS64, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPDoubleToFixedS64, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U16 IREmitter::FPToFixedU16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 16);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U16>(Opcode::FPHalfToFixedU16, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U16>(Opcode::FPSingleToFixedU16, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U16>(Opcode::FPDoubleToFixedU16, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::FPToFixedU32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 32);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U32>(Opcode::FPHalfToFixedU32, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPSingleToFixedU32, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U32>(Opcode::FPDoubleToFixedU32, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U64 IREmitter::FPToFixedU64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= 64);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U64>(Opcode::FPHalfToFixedU64, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U64>(Opcode::FPSingleToFixedU64, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPDoubleToFixedU64, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::FPSignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64)));
+
+ const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U32>(Opcode::FPFixedS16ToSingle, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPFixedS32ToSingle, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U32>(Opcode::FPFixedS64ToSingle, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U32 IREmitter::FPUnsignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64)));
+
+ const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U32>(Opcode::FPFixedU16ToSingle, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U32>(Opcode::FPFixedU32ToSingle, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U32>(Opcode::FPFixedU64ToSingle, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U64 IREmitter::FPSignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64)));
+
+ const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U64>(Opcode::FPFixedS16ToDouble, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U64>(Opcode::FPFixedS32ToDouble, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPFixedS64ToDouble, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
+U64 IREmitter::FPUnsignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) {
+ ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64)));
+
+ const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (a.GetType()) {
+ case Type::U16:
+ return Inst<U64>(Opcode::FPFixedU16ToDouble, a, fbits_imm, rounding_imm);
+ case Type::U32:
+ return Inst<U64>(Opcode::FPFixedU32ToDouble, a, fbits_imm, rounding_imm);
+ case Type::U64:
+ return Inst<U64>(Opcode::FPFixedU64ToDouble, a, fbits_imm, rounding_imm);
+ default:
+ UNREACHABLE();
+ }
+}
+
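+// Vector floating-point helpers dispatch on the element size; most also take an
+// fpcr_controlled flag, which is passed through as an Imm1 operand.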
+U128 IREmitter::FPVectorAbs(size_t esize, const U128& a) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorAbs16, a);
+ case 32:
+ return Inst<U128>(Opcode::FPVectorAbs32, a);
+ case 64:
+ return Inst<U128>(Opcode::FPVectorAbs64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorAdd32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorAdd64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorDiv32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorDiv64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorEqual16, a, b, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorEqual32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorEqual64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(esize == 32);
+ return Inst<U128>(Opcode::FPVectorFromHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+}
+
+U128 IREmitter::FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(fbits <= esize);
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorFromSignedFixed32, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorFromSignedFixed64, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(fbits <= esize);
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorFromUnsignedFixed32, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorFromUnsignedFixed64, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorGreater32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorGreater64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorGreaterEqual32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorGreaterEqual64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMax32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMax64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMin32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMin64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMul32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMul64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMulAdd(size_t esize, const U128& a, const U128& b, const U128& c, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorMulAdd16, a, b, c, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMulAdd32, a, b, c, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMulAdd64, a, b, c, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorMulX32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorMulX64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorNeg(size_t esize, const U128& a) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorNeg16, a);
+ case 32:
+ return Inst<U128>(Opcode::FPVectorNeg32, a);
+ case 64:
+ return Inst<U128>(Opcode::FPVectorNeg64, a);
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorPairedAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorPairedAdd32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorPairedAdd64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorPairedAddLower32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorPairedAddLower64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorRecipEstimate(size_t esize, const U128& a, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorRecipEstimate16, a, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorRecipEstimate32, a, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorRecipEstimate64, a, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorRecipStepFused16, a, b, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorRecipStepFused32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorRecipStepFused64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorRoundInt(size_t esize, const U128& operand, FP::RoundingMode rounding, bool exact, bool fpcr_controlled) {
+ const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+ const IR::U1 exact_imm = Imm1(exact);
+
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorRoundInt16, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorRoundInt32, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorRoundInt64, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorRSqrtEstimate(size_t esize, const U128& a, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorRSqrtEstimate16, a, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorRSqrtEstimate32, a, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorRSqrtEstimate64, a, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorRSqrtStepFused16, a, b, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorRSqrtStepFused32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorRSqrtStepFused64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorSqrt32, a, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorSqrt64, a, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+ switch (esize) {
+ case 32:
+ return Inst<U128>(Opcode::FPVectorSub32, a, b, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorSub64, a, b, Imm1(fpcr_controlled));
+ }
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(esize == 32);
+ return Inst<U128>(Opcode::FPVectorToHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+}
+
+U128 IREmitter::FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(fbits <= esize);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorToSignedFixed16, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorToSignedFixed32, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorToSignedFixed64, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ }
+
+ UNREACHABLE();
+}
+
+U128 IREmitter::FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
+ ASSERT(fbits <= esize);
+
+ const U8 fbits_imm = Imm8(static_cast<u8>(fbits));
+ const U8 rounding_imm = Imm8(static_cast<u8>(rounding));
+
+ switch (esize) {
+ case 16:
+ return Inst<U128>(Opcode::FPVectorToUnsignedFixed16, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ case 32:
+ return Inst<U128>(Opcode::FPVectorToUnsignedFixed32, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ case 64:
+ return Inst<U128>(Opcode::FPVectorToUnsignedFixed64, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled));
+ }
+
+ UNREACHABLE();
+}
+
+void IREmitter::Breakpoint() {
+ Inst(Opcode::Breakpoint);
+}
+
+void IREmitter::CallHostFunction(void (*fn)(void)) {
+ Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), Value{}, Value{}, Value{});
+}
+
+void IREmitter::CallHostFunction(void (*fn)(u64), const U64& arg1) {
+ Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, Value{}, Value{});
+}
+
+void IREmitter::CallHostFunction(void (*fn)(u64, u64), const U64& arg1, const U64& arg2) {
+ Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, arg2, Value{});
+}
+
+void IREmitter::CallHostFunction(void (*fn)(u64, u64, u64), const U64& arg1, const U64& arg2, const U64& arg3) {
+ Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, arg2, arg3);
+}
+
+void IREmitter::SetTerm(const Terminal& terminal) {
+ block.SetTerminal(terminal);
+}
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/ir_emitter.h b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
new file mode 100644
index 0000000000..d37df24572
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
@@ -0,0 +1,432 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/acc_type.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/location_descriptor.h"
+#include "dynarmic/ir/terminal.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::FP {
+enum class RoundingMode;
+} // namespace Dynarmic::FP
+
+// ARM JIT Microinstruction Intermediate Representation
+//
+// This intermediate representation is an SSA IR. It is designed primarily for analysis,
+// though it can be lowered into a reduced form for interpretation. Each IR node (Value)
+// is a microinstruction of an idealised ARM CPU. The choice of microinstructions is made
+// not based on any existing microarchitecture but on ease of implementation.
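+//
+// Illustrative sketch (register names are placeholders): lowering an A32 "ADDS r0, r1, r2"
+// through the IREmitter declared below might look roughly like
+//
+//     const IR::U32 a = /* value read from r1 */, b = /* value read from r2 */;
+//     const IR::U32 result = ir.AddWithCarry(a, b, ir.Imm1(false));
+//     /* write result back to r0, then derive the flags via */ ir.NZCVFrom(result);
+//
+// Each call appends one SSA microinstruction to the block; flag results are modelled as
+// pseudo-operations attached to the instruction that produces them.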
+
+namespace Dynarmic::IR {
+
+enum class Opcode;
+
+template<typename T>
+struct ResultAndCarry {
+ T result;
+ U1 carry;
+};
+
+template<typename T>
+struct ResultAndOverflow {
+ T result;
+ U1 overflow;
+};
+
+template<typename T>
+struct ResultAndGE {
+ T result;
+ U32 ge;
+};
+
+struct UpperAndLower {
+ U128 upper;
+ U128 lower;
+};
+
+enum class MemOp {
+ LOAD,
+ STORE,
+ PREFETCH,
+};
+
+/**
+ * Convenience class to construct a basic block of the intermediate representation.
+ * `block` is the resulting block.
+ * The user of this class (typically a derived, architecture-specific emitter) updates `current_location` as appropriate.
+ */
+class IREmitter {
+public:
+ explicit IREmitter(Block& block)
+ : block(block), insertion_point(block.end()) {}
+
+ Block& block;
+
+ U1 Imm1(bool value) const;
+ U8 Imm8(u8 value) const;
+ U16 Imm16(u16 value) const;
+ U32 Imm32(u32 value) const;
+ U64 Imm64(u64 value) const;
+
+ void PushRSB(const LocationDescriptor& return_location);
+
+ U64 Pack2x32To1x64(const U32& lo, const U32& hi);
+ U128 Pack2x64To1x128(const U64& lo, const U64& hi);
+ UAny LeastSignificant(size_t bitsize, const U32U64& value);
+ U32 LeastSignificantWord(const U64& value);
+ U16 LeastSignificantHalf(U32U64 value);
+ U8 LeastSignificantByte(U32U64 value);
+ ResultAndCarry<U32> MostSignificantWord(const U64& value);
+ U1 MostSignificantBit(const U32& value);
+ U1 IsZero(const U32& value);
+ U1 IsZero(const U64& value);
+ U1 IsZero(const U32U64& value);
+ U1 TestBit(const U32U64& value, const U8& bit);
+ U32 ConditionalSelect(Cond cond, const U32& a, const U32& b);
+ U64 ConditionalSelect(Cond cond, const U64& a, const U64& b);
+ NZCV ConditionalSelect(Cond cond, const NZCV& a, const NZCV& b);
+ U32U64 ConditionalSelect(Cond cond, const U32U64& a, const U32U64& b);
+
+ U1 GetCFlagFromNZCV(const NZCV& nzcv);
+ NZCV NZCVFromPackedFlags(const U32& a);
+ // This pseudo-instruction may only be added to instructions that support it.
+ NZCV NZCVFrom(const Value& value);
+
+ ResultAndCarry<U32> LogicalShiftLeft(const U32& value_in, const U8& shift_amount, const U1& carry_in);
+ ResultAndCarry<U32> LogicalShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in);
+ ResultAndCarry<U32> ArithmeticShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in);
+ ResultAndCarry<U32> RotateRight(const U32& value_in, const U8& shift_amount, const U1& carry_in);
+ U32U64 LogicalShiftLeft(const U32U64& value_in, const U8& shift_amount);
+ U32U64 LogicalShiftRight(const U32U64& value_in, const U8& shift_amount);
+ U32U64 ArithmeticShiftRight(const U32U64& value_in, const U8& shift_amount);
+ U32U64 RotateRight(const U32U64& value_in, const U8& shift_amount);
+ U32U64 LogicalShiftLeftMasked(const U32U64& value_in, const U32U64& shift_amount);
+ U32U64 LogicalShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount);
+ U32U64 ArithmeticShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount);
+ U32U64 RotateRightMasked(const U32U64& value_in, const U32U64& shift_amount);
+ ResultAndCarry<U32> RotateRightExtended(const U32& value_in, const U1& carry_in);
+ U32U64 AddWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in);
+ U32U64 SubWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in);
+ U32U64 Add(const U32U64& a, const U32U64& b);
+ U32U64 Sub(const U32U64& a, const U32U64& b);
+ U32U64 Mul(const U32U64& a, const U32U64& b);
+ U64 UnsignedMultiplyHigh(const U64& a, const U64& b);
+ U64 SignedMultiplyHigh(const U64& a, const U64& b);
+ U32U64 UnsignedDiv(const U32U64& a, const U32U64& b);
+ U32U64 SignedDiv(const U32U64& a, const U32U64& b);
+ U32U64 And(const U32U64& a, const U32U64& b);
+ U32U64 AndNot(const U32U64& a, const U32U64& b);
+ U32U64 Eor(const U32U64& a, const U32U64& b);
+ U32U64 Or(const U32U64& a, const U32U64& b);
+ U32U64 Not(const U32U64& a);
+ U32 SignExtendToWord(const UAny& a);
+ U64 SignExtendToLong(const UAny& a);
+ U32 SignExtendByteToWord(const U8& a);
+ U32 SignExtendHalfToWord(const U16& a);
+ U64 SignExtendWordToLong(const U32& a);
+ U32 ZeroExtendToWord(const UAny& a);
+ U64 ZeroExtendToLong(const UAny& a);
+ U128 ZeroExtendToQuad(const UAny& a);
+ U32 ZeroExtendByteToWord(const U8& a);
+ U32 ZeroExtendHalfToWord(const U16& a);
+ U64 ZeroExtendWordToLong(const U32& a);
+ U32 IndeterminateExtendToWord(const UAny& a);
+ U64 IndeterminateExtendToLong(const UAny& a);
+ U32 ByteReverseWord(const U32& a);
+ U16 ByteReverseHalf(const U16& a);
+ U64 ByteReverseDual(const U64& a);
+ U32U64 CountLeadingZeros(const U32U64& a);
+ U32U64 ExtractRegister(const U32U64& a, const U32U64& b, const U8& lsb);
+ U32U64 ReplicateBit(const U32U64& a, u8 bit);
+ U32U64 MaxSigned(const U32U64& a, const U32U64& b);
+ U32U64 MaxUnsigned(const U32U64& a, const U32U64& b);
+ U32U64 MinSigned(const U32U64& a, const U32U64& b);
+ U32U64 MinUnsigned(const U32U64& a, const U32U64& b);
+
+ ResultAndOverflow<U32> SignedSaturatedAddWithFlag(const U32& a, const U32& b);
+ ResultAndOverflow<U32> SignedSaturatedSubWithFlag(const U32& a, const U32& b);
+ ResultAndOverflow<U32> SignedSaturation(const U32& a, size_t bit_size_to_saturate_to);
+ ResultAndOverflow<U32> UnsignedSaturation(const U32& a, size_t bit_size_to_saturate_to);
+
+ UAny SignedSaturatedAdd(const UAny& a, const UAny& b);
+ UAny SignedSaturatedDoublingMultiplyReturnHigh(const UAny& a, const UAny& b);
+ UAny SignedSaturatedSub(const UAny& a, const UAny& b);
+ UAny UnsignedSaturatedAdd(const UAny& a, const UAny& b);
+ UAny UnsignedSaturatedSub(const UAny& a, const UAny& b);
+
+ U128 VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b);
+ U128 VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b);
+ U128 VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b);
+
+ ResultAndGE<U32> PackedAddU8(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedAddS8(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedAddU16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedAddS16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubU8(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubS8(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubU16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubS16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedAddSubU16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedAddSubS16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubAddU16(const U32& a, const U32& b);
+ ResultAndGE<U32> PackedSubAddS16(const U32& a, const U32& b);
+ U32 PackedHalvingAddU8(const U32& a, const U32& b);
+ U32 PackedHalvingAddS8(const U32& a, const U32& b);
+ U32 PackedHalvingSubU8(const U32& a, const U32& b);
+ U32 PackedHalvingSubS8(const U32& a, const U32& b);
+ U32 PackedHalvingAddU16(const U32& a, const U32& b);
+ U32 PackedHalvingAddS16(const U32& a, const U32& b);
+ U32 PackedHalvingSubU16(const U32& a, const U32& b);
+ U32 PackedHalvingSubS16(const U32& a, const U32& b);
+ U32 PackedHalvingAddSubU16(const U32& a, const U32& b);
+ U32 PackedHalvingAddSubS16(const U32& a, const U32& b);
+ U32 PackedHalvingSubAddU16(const U32& a, const U32& b);
+ U32 PackedHalvingSubAddS16(const U32& a, const U32& b);
+ U32 PackedSaturatedAddU8(const U32& a, const U32& b);
+ U32 PackedSaturatedAddS8(const U32& a, const U32& b);
+ U32 PackedSaturatedSubU8(const U32& a, const U32& b);
+ U32 PackedSaturatedSubS8(const U32& a, const U32& b);
+ U32 PackedSaturatedAddU16(const U32& a, const U32& b);
+ U32 PackedSaturatedAddS16(const U32& a, const U32& b);
+ U32 PackedSaturatedSubU16(const U32& a, const U32& b);
+ U32 PackedSaturatedSubS16(const U32& a, const U32& b);
+ U32 PackedAbsDiffSumU8(const U32& a, const U32& b);
+ U32 PackedSelect(const U32& ge, const U32& a, const U32& b);
+
+ U32 CRC32Castagnoli8(const U32& a, const U32& b);
+ U32 CRC32Castagnoli16(const U32& a, const U32& b);
+ U32 CRC32Castagnoli32(const U32& a, const U32& b);
+ U32 CRC32Castagnoli64(const U32& a, const U64& b);
+ U32 CRC32ISO8(const U32& a, const U32& b);
+ U32 CRC32ISO16(const U32& a, const U32& b);
+ U32 CRC32ISO32(const U32& a, const U32& b);
+ U32 CRC32ISO64(const U32& a, const U64& b);
+
+ U128 AESDecryptSingleRound(const U128& a);
+ U128 AESEncryptSingleRound(const U128& a);
+ U128 AESInverseMixColumns(const U128& a);
+ U128 AESMixColumns(const U128& a);
+
+ U8 SM4AccessSubstitutionBox(const U8& a);
+
+ U128 SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1);
+ U128 SHA256MessageSchedule0(const U128& x, const U128& y);
+ U128 SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z);
+
+ UAny VectorGetElement(size_t esize, const U128& a, size_t index);
+ U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem);
+ U128 VectorAbs(size_t esize, const U128& a);
+ U128 VectorAdd(size_t esize, const U128& a, const U128& b);
+ U128 VectorAnd(const U128& a, const U128& b);
+ U128 VectorAndNot(const U128& a, const U128& b);
+ U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount);
+ U128 VectorArithmeticVShift(size_t esize, const U128& a, const U128& b);
+ U128 VectorBroadcast(size_t esize, const UAny& a);
+ U128 VectorBroadcastLower(size_t esize, const UAny& a);
+ U128 VectorBroadcastElement(size_t esize, const U128& a, size_t index);
+ U128 VectorBroadcastElementLower(size_t esize, const U128& a, size_t index);
+ U128 VectorCountLeadingZeros(size_t esize, const U128& a);
+ U128 VectorEor(const U128& a, const U128& b);
+ U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
+ U128 VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b);
+ U128 VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorEqual(size_t esize, const U128& a, const U128& b);
+ U128 VectorExtract(const U128& a, const U128& b, size_t position);
+ U128 VectorExtractLower(const U128& a, const U128& b, size_t position);
+ U128 VectorGreaterEqualSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorGreaterEqualUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorGreaterSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorInterleaveLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorInterleaveUpper(size_t esize, const U128& a, const U128& b);
+ U128 VectorLessEqualSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorLessEqualUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorLessSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorLessUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorLogicalShiftLeft(size_t esize, const U128& a, u8 shift_amount);
+ U128 VectorLogicalShiftRight(size_t esize, const U128& a, u8 shift_amount);
+ U128 VectorLogicalVShift(size_t esize, const U128& a, const U128& b);
+ U128 VectorMaxSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorMaxUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorMinSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorMinUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorMultiply(size_t esize, const U128& a, const U128& b);
+ U128 VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b);
+ U128 VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b);
+ U128 VectorNarrow(size_t original_esize, const U128& a);
+ U128 VectorNot(const U128& a);
+ U128 VectorOr(const U128& a, const U128& b);
+ U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a);
+ U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a);
+ U128 VectorPairedMaxSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMaxUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMinSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMinUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMaxSignedLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMaxUnsignedLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMinSignedLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorPairedMinUnsignedLower(size_t esize, const U128& a, const U128& b);
+ U128 VectorPolynomialMultiply(const U128& a, const U128& b);
+ U128 VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b);
+ U128 VectorPopulationCount(const U128& a);
+ U128 VectorReverseBits(const U128& a);
+ U128 VectorReverseElementsInHalfGroups(size_t esize, const U128& a);
+ U128 VectorReverseElementsInWordGroups(size_t esize, const U128& a);
+ U128 VectorReverseElementsInLongGroups(size_t esize, const U128& a);
+ U128 VectorReduceAdd(size_t esize, const U128& a);
+ U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
+ U128 VectorRotateRight(size_t esize, const U128& a, u8 amount);
+ U128 VectorRotateWholeVectorRight(const U128& a, u8 amount);
+ U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorRoundingShiftLeftSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignExtend(size_t original_esize, const U128& a);
+ U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
+ UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
+ U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedDoublingMultiplyHigh(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedDoublingMultiplyHighRounding(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
+ U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
+ U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
+ U128 VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b);
+ U128 VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, u8 shift_amount);
+ U128 VectorSub(size_t esize, const U128& a, const U128& b);
+ Table VectorTable(std::vector<U64> values);
+ Table VectorTable(std::vector<U128> values);
+ U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices);
+ U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
+ U128 VectorTranspose(size_t esize, const U128& a, const U128& b, bool part);
+ U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
+ U128 VectorUnsignedRecipEstimate(const U128& a);
+ U128 VectorUnsignedRecipSqrtEstimate(const U128& a);
+ U128 VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b);
+ U128 VectorUnsignedSaturatedNarrow(size_t esize, const U128& a);
+ U128 VectorUnsignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b);
+ U128 VectorZeroExtend(size_t original_esize, const U128& a);
+ U128 VectorZeroUpper(const U128& a);
+ U128 ZeroVector();
+
+ U16U32U64 FPAbs(const U16U32U64& a);
+ U32U64 FPAdd(const U32U64& a, const U32U64& b);
+ NZCV FPCompare(const U32U64& a, const U32U64& b, bool exc_on_qnan);
+ U32U64 FPDiv(const U32U64& a, const U32U64& b);
+ U32U64 FPMax(const U32U64& a, const U32U64& b);
+ U32U64 FPMaxNumeric(const U32U64& a, const U32U64& b);
+ U32U64 FPMin(const U32U64& a, const U32U64& b);
+ U32U64 FPMinNumeric(const U32U64& a, const U32U64& b);
+ U32U64 FPMul(const U32U64& a, const U32U64& b);
+ U16U32U64 FPMulAdd(const U16U32U64& addend, const U16U32U64& op1, const U16U32U64& op2);
+ U16U32U64 FPMulSub(const U16U32U64& minuend, const U16U32U64& op1, const U16U32U64& op2);
+ U32U64 FPMulX(const U32U64& a, const U32U64& b);
+ U16U32U64 FPNeg(const U16U32U64& a);
+ U16U32U64 FPRecipEstimate(const U16U32U64& a);
+ U16U32U64 FPRecipExponent(const U16U32U64& a);
+ U16U32U64 FPRecipStepFused(const U16U32U64& a, const U16U32U64& b);
+ U16U32U64 FPRoundInt(const U16U32U64& a, FP::RoundingMode rounding, bool exact);
+ U16U32U64 FPRSqrtEstimate(const U16U32U64& a);
+ U16U32U64 FPRSqrtStepFused(const U16U32U64& a, const U16U32U64& b);
+ U32U64 FPSqrt(const U32U64& a);
+ U32U64 FPSub(const U32U64& a, const U32U64& b);
+ U16 FPDoubleToHalf(const U64& a, FP::RoundingMode rounding);
+ U32 FPDoubleToSingle(const U64& a, FP::RoundingMode rounding);
+ U64 FPHalfToDouble(const U16& a, FP::RoundingMode rounding);
+ U32 FPHalfToSingle(const U16& a, FP::RoundingMode rounding);
+ U16 FPSingleToHalf(const U32& a, FP::RoundingMode rounding);
+ U64 FPSingleToDouble(const U32& a, FP::RoundingMode rounding);
+ U16 FPToFixedS16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U32 FPToFixedS32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U64 FPToFixedS64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U16 FPToFixedU16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U32 FPToFixedU32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U64 FPToFixedU64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U32 FPSignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U32 FPUnsignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U64 FPSignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+ U64 FPUnsignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding);
+
+ U128 FPVectorAbs(size_t esize, const U128& a);
+ U128 FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
+ U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
+ U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
+ U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
+ U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorNeg(size_t esize, const U128& a);
+ U128 FPVectorPairedAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorRecipEstimate(size_t esize, const U128& a, bool fpcr_controlled = true);
+ U128 FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorRoundInt(size_t esize, const U128& operand, FP::RoundingMode rounding, bool exact, bool fpcr_controlled = true);
+ U128 FPVectorRSqrtEstimate(size_t esize, const U128& a, bool fpcr_controlled = true);
+ U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled = true);
+ U128 FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+ U128 FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
+ U128 FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
+ U128 FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
+
+ void Breakpoint();
+ void CallHostFunction(void (*fn)(void));
+ void CallHostFunction(void (*fn)(u64), const U64& arg1);
+ void CallHostFunction(void (*fn)(u64, u64), const U64& arg1, const U64& arg2);
+ void CallHostFunction(void (*fn)(u64, u64, u64), const U64& arg1, const U64& arg2, const U64& arg3);
+
+ void SetTerm(const Terminal& terminal);
+
+ void SetInsertionPointBefore(IR::Inst* new_insertion_point) {
+ insertion_point = IR::Block::iterator{*new_insertion_point};
+ }
+
+ void SetInsertionPointBefore(IR::Block::iterator new_insertion_point) {
+ insertion_point = new_insertion_point;
+ }
+
+ void SetInsertionPointAfter(IR::Inst* new_insertion_point) {
+ insertion_point = IR::Block::iterator{*new_insertion_point};
+ ++insertion_point;
+ }
+
+ void SetInsertionPointAfter(IR::Block::iterator new_insertion_point) {
+ insertion_point = new_insertion_point;
+ ++insertion_point;
+ }
+
+protected:
+ IR::Block::iterator insertion_point;
+
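+    // Emits a new instruction with opcode `op` and the given arguments immediately before the
+    // current insertion point and returns it, viewed as IR type T.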
+ template<typename T = Value, typename... Args>
+ T Inst(Opcode op, Args... args) {
+ auto iter = block.PrependNewInst(insertion_point, op, {Value(args)...});
+ return T(Value(&*iter));
+ }
+};
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp b/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp
new file mode 100644
index 0000000000..e7e640561d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp
@@ -0,0 +1,16 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/location_descriptor.h"
+
+#include <fmt/format.h>
+
+namespace Dynarmic::IR {
+
+std::string ToString(const LocationDescriptor& descriptor) {
+ return fmt::format("{{{:016x}}}", descriptor.Value());
+}
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/location_descriptor.h b/externals/dynarmic/src/dynarmic/ir/location_descriptor.h
new file mode 100644
index 0000000000..48e5e32bb1
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/location_descriptor.h
@@ -0,0 +1,64 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include <fmt/format.h>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::IR {
+
+class LocationDescriptor {
+public:
+ explicit LocationDescriptor(u64 value)
+ : value(value) {}
+
+ bool operator==(const LocationDescriptor& o) const {
+ return value == o.Value();
+ }
+
+ bool operator!=(const LocationDescriptor& o) const {
+ return !operator==(o);
+ }
+
+ u64 Value() const { return value; }
+
+private:
+ u64 value;
+};
+
+std::string ToString(const LocationDescriptor& descriptor);
+
+inline bool operator<(const LocationDescriptor& x, const LocationDescriptor& y) noexcept {
+ return x.Value() < y.Value();
+}
+
+} // namespace Dynarmic::IR
+
+namespace std {
+template<>
+struct less<Dynarmic::IR::LocationDescriptor> {
+ bool operator()(const Dynarmic::IR::LocationDescriptor& x, const Dynarmic::IR::LocationDescriptor& y) const noexcept {
+ return x < y;
+ }
+};
+template<>
+struct hash<Dynarmic::IR::LocationDescriptor> {
+ size_t operator()(const Dynarmic::IR::LocationDescriptor& x) const noexcept {
+ return std::hash<u64>()(x.Value());
+ }
+};
+} // namespace std
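+// Usage sketch: the std::less and std::hash specialisations above allow LocationDescriptor to
+// be used directly as a key in standard containers, e.g. (mapped type here is illustrative):
+//     std::unordered_map<Dynarmic::IR::LocationDescriptor, void*> compiled_blocks;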
+
+template<>
+struct fmt::formatter<Dynarmic::IR::LocationDescriptor> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::IR::LocationDescriptor descriptor, FormatContext& ctx) const {
+ return formatter<std::string>::format(ToString(descriptor), ctx);
+ }
+};
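+
+// Example (illustrative): fmt::format("{}", Dynarmic::IR::LocationDescriptor(0x1234))
+// produces "{0000000000001234}" via ToString above.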
diff --git a/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp b/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp
new file mode 100644
index 0000000000..50af036ddf
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp
@@ -0,0 +1,712 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/microinstruction.h"
+
+#include <algorithm>
+
+#include <mcl/assert.hpp>
+
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::IR {
+
+bool Inst::IsArithmeticShift() const {
+ return op == Opcode::ArithmeticShiftRight32
+ || op == Opcode::ArithmeticShiftRight64;
+}
+
+bool Inst::IsCircularShift() const {
+ return op == Opcode::RotateRight32
+ || op == Opcode::RotateRight64
+ || op == Opcode::RotateRightExtended;
+}
+
+bool Inst::IsLogicalShift() const {
+ switch (op) {
+ case Opcode::LogicalShiftLeft32:
+ case Opcode::LogicalShiftLeft64:
+ case Opcode::LogicalShiftRight32:
+ case Opcode::LogicalShiftRight64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsShift() const {
+ return IsArithmeticShift()
+ || IsCircularShift()
+ || IsLogicalShift();
+}
+
+bool Inst::IsBarrier() const {
+ switch (op) {
+ case Opcode::A32DataMemoryBarrier:
+ case Opcode::A32DataSynchronizationBarrier:
+ case Opcode::A32InstructionSynchronizationBarrier:
+ case Opcode::A64DataMemoryBarrier:
+ case Opcode::A64DataSynchronizationBarrier:
+ case Opcode::A64InstructionSynchronizationBarrier:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsSharedMemoryRead() const {
+ switch (op) {
+ case Opcode::A32ReadMemory8:
+ case Opcode::A32ReadMemory16:
+ case Opcode::A32ReadMemory32:
+ case Opcode::A32ReadMemory64:
+ case Opcode::A64ReadMemory8:
+ case Opcode::A64ReadMemory16:
+ case Opcode::A64ReadMemory32:
+ case Opcode::A64ReadMemory64:
+ case Opcode::A64ReadMemory128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsSharedMemoryWrite() const {
+ switch (op) {
+ case Opcode::A32WriteMemory8:
+ case Opcode::A32WriteMemory16:
+ case Opcode::A32WriteMemory32:
+ case Opcode::A32WriteMemory64:
+ case Opcode::A64WriteMemory8:
+ case Opcode::A64WriteMemory16:
+ case Opcode::A64WriteMemory32:
+ case Opcode::A64WriteMemory64:
+ case Opcode::A64WriteMemory128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsSharedMemoryReadOrWrite() const {
+ return IsSharedMemoryRead()
+ || IsSharedMemoryWrite();
+}
+
+bool Inst::IsExclusiveMemoryRead() const {
+ switch (op) {
+ case Opcode::A32ExclusiveReadMemory8:
+ case Opcode::A32ExclusiveReadMemory16:
+ case Opcode::A32ExclusiveReadMemory32:
+ case Opcode::A32ExclusiveReadMemory64:
+ case Opcode::A64ExclusiveReadMemory8:
+ case Opcode::A64ExclusiveReadMemory16:
+ case Opcode::A64ExclusiveReadMemory32:
+ case Opcode::A64ExclusiveReadMemory64:
+ case Opcode::A64ExclusiveReadMemory128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsExclusiveMemoryWrite() const {
+ switch (op) {
+ case Opcode::A32ExclusiveWriteMemory8:
+ case Opcode::A32ExclusiveWriteMemory16:
+ case Opcode::A32ExclusiveWriteMemory32:
+ case Opcode::A32ExclusiveWriteMemory64:
+ case Opcode::A64ExclusiveWriteMemory8:
+ case Opcode::A64ExclusiveWriteMemory16:
+ case Opcode::A64ExclusiveWriteMemory32:
+ case Opcode::A64ExclusiveWriteMemory64:
+ case Opcode::A64ExclusiveWriteMemory128:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsMemoryRead() const {
+ return IsSharedMemoryRead()
+ || IsExclusiveMemoryRead();
+}
+
+bool Inst::IsMemoryWrite() const {
+ return IsSharedMemoryWrite()
+ || IsExclusiveMemoryWrite();
+}
+
+bool Inst::IsMemoryReadOrWrite() const {
+ return IsMemoryRead()
+ || IsMemoryWrite();
+}
+
+bool Inst::ReadsFromCPSR() const {
+ switch (op) {
+ case Opcode::A32GetCpsr:
+ case Opcode::A32GetCFlag:
+ case Opcode::A32GetGEFlags:
+ case Opcode::A32UpdateUpperLocationDescriptor:
+ case Opcode::A64GetCFlag:
+ case Opcode::A64GetNZCVRaw:
+ case Opcode::ConditionalSelect32:
+ case Opcode::ConditionalSelect64:
+ case Opcode::ConditionalSelectNZCV:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::WritesToCPSR() const {
+ switch (op) {
+ case Opcode::A32SetCpsr:
+ case Opcode::A32SetCpsrNZCVRaw:
+ case Opcode::A32SetCpsrNZCV:
+ case Opcode::A32SetCpsrNZCVQ:
+ case Opcode::A32SetCpsrNZ:
+ case Opcode::A32SetCpsrNZC:
+ case Opcode::A32OrQFlag:
+ case Opcode::A32SetGEFlags:
+ case Opcode::A32SetGEFlagsCompressed:
+ case Opcode::A32UpdateUpperLocationDescriptor:
+ case Opcode::A64SetNZCVRaw:
+ case Opcode::A64SetNZCV:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::WritesToSystemRegister() const {
+ switch (op) {
+ case Opcode::A64SetTPIDR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool Inst::ReadsFromCoreRegister() const {
+ switch (op) {
+ case Opcode::A32GetRegister:
+ case Opcode::A32GetExtendedRegister32:
+ case Opcode::A32GetExtendedRegister64:
+ case Opcode::A32GetVector:
+ case Opcode::A64GetW:
+ case Opcode::A64GetX:
+ case Opcode::A64GetS:
+ case Opcode::A64GetD:
+ case Opcode::A64GetQ:
+ case Opcode::A64GetSP:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::WritesToCoreRegister() const {
+ switch (op) {
+ case Opcode::A32SetRegister:
+ case Opcode::A32SetExtendedRegister32:
+ case Opcode::A32SetExtendedRegister64:
+ case Opcode::A32SetVector:
+ case Opcode::A32BXWritePC:
+ case Opcode::A64SetW:
+ case Opcode::A64SetX:
+ case Opcode::A64SetS:
+ case Opcode::A64SetD:
+ case Opcode::A64SetQ:
+ case Opcode::A64SetSP:
+ case Opcode::A64SetPC:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::ReadsFromFPCR() const {
+ switch (op) {
+ case Opcode::A32GetFpscr:
+ case Opcode::A32GetFpscrNZCV:
+ case Opcode::A64GetFPCR:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::WritesToFPCR() const {
+ switch (op) {
+ case Opcode::A32SetFpscr:
+ case Opcode::A32SetFpscrNZCV:
+ case Opcode::A64SetFPCR:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::ReadsFromFPSR() const {
+ return op == Opcode::A32GetFpscr
+ || op == Opcode::A32GetFpscrNZCV
+ || op == Opcode::A64GetFPSR
+ || ReadsFromFPSRCumulativeExceptionBits()
+ || ReadsFromFPSRCumulativeSaturationBit();
+}
+
+bool Inst::WritesToFPSR() const {
+ return op == Opcode::A32SetFpscr
+ || op == Opcode::A32SetFpscrNZCV
+ || op == Opcode::A64SetFPSR
+ || WritesToFPSRCumulativeExceptionBits()
+ || WritesToFPSRCumulativeSaturationBit();
+}
+
+bool Inst::ReadsFromFPSRCumulativeExceptionBits() const {
+ return ReadsFromAndWritesToFPSRCumulativeExceptionBits();
+}
+
+bool Inst::WritesToFPSRCumulativeExceptionBits() const {
+ return ReadsFromAndWritesToFPSRCumulativeExceptionBits();
+}
+
+bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const {
+ switch (op) {
+ case Opcode::FPAdd32:
+ case Opcode::FPAdd64:
+ case Opcode::FPCompare32:
+ case Opcode::FPCompare64:
+ case Opcode::FPDiv32:
+ case Opcode::FPDiv64:
+ case Opcode::FPMax32:
+ case Opcode::FPMax64:
+ case Opcode::FPMaxNumeric32:
+ case Opcode::FPMaxNumeric64:
+ case Opcode::FPMin32:
+ case Opcode::FPMin64:
+ case Opcode::FPMinNumeric32:
+ case Opcode::FPMinNumeric64:
+ case Opcode::FPMul32:
+ case Opcode::FPMul64:
+ case Opcode::FPMulAdd16:
+ case Opcode::FPMulAdd32:
+ case Opcode::FPMulAdd64:
+ case Opcode::FPMulSub16:
+ case Opcode::FPMulSub32:
+ case Opcode::FPMulSub64:
+ case Opcode::FPRecipEstimate16:
+ case Opcode::FPRecipEstimate32:
+ case Opcode::FPRecipEstimate64:
+ case Opcode::FPRecipExponent16:
+ case Opcode::FPRecipExponent32:
+ case Opcode::FPRecipExponent64:
+ case Opcode::FPRecipStepFused16:
+ case Opcode::FPRecipStepFused32:
+ case Opcode::FPRecipStepFused64:
+ case Opcode::FPRoundInt16:
+ case Opcode::FPRoundInt32:
+ case Opcode::FPRoundInt64:
+ case Opcode::FPRSqrtEstimate16:
+ case Opcode::FPRSqrtEstimate32:
+ case Opcode::FPRSqrtEstimate64:
+ case Opcode::FPRSqrtStepFused16:
+ case Opcode::FPRSqrtStepFused32:
+ case Opcode::FPRSqrtStepFused64:
+ case Opcode::FPSqrt32:
+ case Opcode::FPSqrt64:
+ case Opcode::FPSub32:
+ case Opcode::FPSub64:
+ case Opcode::FPHalfToDouble:
+ case Opcode::FPHalfToSingle:
+ case Opcode::FPSingleToDouble:
+ case Opcode::FPSingleToHalf:
+ case Opcode::FPDoubleToHalf:
+ case Opcode::FPDoubleToSingle:
+ case Opcode::FPDoubleToFixedS32:
+ case Opcode::FPDoubleToFixedS64:
+ case Opcode::FPDoubleToFixedU32:
+ case Opcode::FPDoubleToFixedU64:
+ case Opcode::FPHalfToFixedS32:
+ case Opcode::FPHalfToFixedS64:
+ case Opcode::FPHalfToFixedU32:
+ case Opcode::FPHalfToFixedU64:
+ case Opcode::FPSingleToFixedS32:
+ case Opcode::FPSingleToFixedS64:
+ case Opcode::FPSingleToFixedU32:
+ case Opcode::FPSingleToFixedU64:
+ case Opcode::FPFixedU32ToSingle:
+ case Opcode::FPFixedS32ToSingle:
+ case Opcode::FPFixedU32ToDouble:
+ case Opcode::FPFixedU64ToDouble:
+ case Opcode::FPFixedU64ToSingle:
+ case Opcode::FPFixedS32ToDouble:
+ case Opcode::FPFixedS64ToDouble:
+ case Opcode::FPFixedS64ToSingle:
+ case Opcode::FPVectorAdd32:
+ case Opcode::FPVectorAdd64:
+ case Opcode::FPVectorDiv32:
+ case Opcode::FPVectorDiv64:
+ case Opcode::FPVectorEqual16:
+ case Opcode::FPVectorEqual32:
+ case Opcode::FPVectorEqual64:
+ case Opcode::FPVectorFromSignedFixed32:
+ case Opcode::FPVectorFromSignedFixed64:
+ case Opcode::FPVectorFromUnsignedFixed32:
+ case Opcode::FPVectorFromUnsignedFixed64:
+ case Opcode::FPVectorGreater32:
+ case Opcode::FPVectorGreater64:
+ case Opcode::FPVectorGreaterEqual32:
+ case Opcode::FPVectorGreaterEqual64:
+ case Opcode::FPVectorMul32:
+ case Opcode::FPVectorMul64:
+ case Opcode::FPVectorMulAdd16:
+ case Opcode::FPVectorMulAdd32:
+ case Opcode::FPVectorMulAdd64:
+ case Opcode::FPVectorPairedAddLower32:
+ case Opcode::FPVectorPairedAddLower64:
+ case Opcode::FPVectorPairedAdd32:
+ case Opcode::FPVectorPairedAdd64:
+ case Opcode::FPVectorRecipEstimate16:
+ case Opcode::FPVectorRecipEstimate32:
+ case Opcode::FPVectorRecipEstimate64:
+ case Opcode::FPVectorRecipStepFused16:
+ case Opcode::FPVectorRecipStepFused32:
+ case Opcode::FPVectorRecipStepFused64:
+ case Opcode::FPVectorRoundInt16:
+ case Opcode::FPVectorRoundInt32:
+ case Opcode::FPVectorRoundInt64:
+ case Opcode::FPVectorRSqrtEstimate16:
+ case Opcode::FPVectorRSqrtEstimate32:
+ case Opcode::FPVectorRSqrtEstimate64:
+ case Opcode::FPVectorRSqrtStepFused16:
+ case Opcode::FPVectorRSqrtStepFused32:
+ case Opcode::FPVectorRSqrtStepFused64:
+ case Opcode::FPVectorSqrt32:
+ case Opcode::FPVectorSqrt64:
+ case Opcode::FPVectorSub32:
+ case Opcode::FPVectorSub64:
+ case Opcode::FPVectorToSignedFixed16:
+ case Opcode::FPVectorToSignedFixed32:
+ case Opcode::FPVectorToSignedFixed64:
+ case Opcode::FPVectorToUnsignedFixed16:
+ case Opcode::FPVectorToUnsignedFixed32:
+ case Opcode::FPVectorToUnsignedFixed64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::ReadsFromFPSRCumulativeSaturationBit() const {
+ return false;
+}
+
+bool Inst::WritesToFPSRCumulativeSaturationBit() const {
+ switch (op) {
+ case Opcode::SignedSaturatedAdd8:
+ case Opcode::SignedSaturatedAdd16:
+ case Opcode::SignedSaturatedAdd32:
+ case Opcode::SignedSaturatedAdd64:
+ case Opcode::SignedSaturatedDoublingMultiplyReturnHigh16:
+ case Opcode::SignedSaturatedDoublingMultiplyReturnHigh32:
+ case Opcode::SignedSaturatedSub8:
+ case Opcode::SignedSaturatedSub16:
+ case Opcode::SignedSaturatedSub32:
+ case Opcode::SignedSaturatedSub64:
+ case Opcode::UnsignedSaturatedAdd8:
+ case Opcode::UnsignedSaturatedAdd16:
+ case Opcode::UnsignedSaturatedAdd32:
+ case Opcode::UnsignedSaturatedAdd64:
+ case Opcode::UnsignedSaturatedSub8:
+ case Opcode::UnsignedSaturatedSub16:
+ case Opcode::UnsignedSaturatedSub32:
+ case Opcode::UnsignedSaturatedSub64:
+ case Opcode::VectorSignedSaturatedAbs8:
+ case Opcode::VectorSignedSaturatedAbs16:
+ case Opcode::VectorSignedSaturatedAbs32:
+ case Opcode::VectorSignedSaturatedAbs64:
+ case Opcode::VectorSignedSaturatedAccumulateUnsigned8:
+ case Opcode::VectorSignedSaturatedAccumulateUnsigned16:
+ case Opcode::VectorSignedSaturatedAccumulateUnsigned32:
+ case Opcode::VectorSignedSaturatedAccumulateUnsigned64:
+ case Opcode::VectorSignedSaturatedAdd8:
+ case Opcode::VectorSignedSaturatedAdd16:
+ case Opcode::VectorSignedSaturatedAdd32:
+ case Opcode::VectorSignedSaturatedAdd64:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyHigh16:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyHigh32:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyLong16:
+ case Opcode::VectorSignedSaturatedDoublingMultiplyLong32:
+ case Opcode::VectorSignedSaturatedNarrowToSigned16:
+ case Opcode::VectorSignedSaturatedNarrowToSigned32:
+ case Opcode::VectorSignedSaturatedNarrowToSigned64:
+ case Opcode::VectorSignedSaturatedNarrowToUnsigned16:
+ case Opcode::VectorSignedSaturatedNarrowToUnsigned32:
+ case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
+ case Opcode::VectorSignedSaturatedNeg8:
+ case Opcode::VectorSignedSaturatedNeg16:
+ case Opcode::VectorSignedSaturatedNeg32:
+ case Opcode::VectorSignedSaturatedNeg64:
+ case Opcode::VectorSignedSaturatedShiftLeft8:
+ case Opcode::VectorSignedSaturatedShiftLeft16:
+ case Opcode::VectorSignedSaturatedShiftLeft32:
+ case Opcode::VectorSignedSaturatedShiftLeft64:
+ case Opcode::VectorSignedSaturatedShiftLeftUnsigned8:
+ case Opcode::VectorSignedSaturatedShiftLeftUnsigned16:
+ case Opcode::VectorSignedSaturatedShiftLeftUnsigned32:
+ case Opcode::VectorSignedSaturatedShiftLeftUnsigned64:
+ case Opcode::VectorSignedSaturatedSub8:
+ case Opcode::VectorSignedSaturatedSub16:
+ case Opcode::VectorSignedSaturatedSub32:
+ case Opcode::VectorSignedSaturatedSub64:
+ case Opcode::VectorUnsignedSaturatedAccumulateSigned8:
+ case Opcode::VectorUnsignedSaturatedAccumulateSigned16:
+ case Opcode::VectorUnsignedSaturatedAccumulateSigned32:
+ case Opcode::VectorUnsignedSaturatedAccumulateSigned64:
+ case Opcode::VectorUnsignedSaturatedAdd8:
+ case Opcode::VectorUnsignedSaturatedAdd16:
+ case Opcode::VectorUnsignedSaturatedAdd32:
+ case Opcode::VectorUnsignedSaturatedAdd64:
+ case Opcode::VectorUnsignedSaturatedNarrow16:
+ case Opcode::VectorUnsignedSaturatedNarrow32:
+ case Opcode::VectorUnsignedSaturatedNarrow64:
+ case Opcode::VectorUnsignedSaturatedShiftLeft8:
+ case Opcode::VectorUnsignedSaturatedShiftLeft16:
+ case Opcode::VectorUnsignedSaturatedShiftLeft32:
+ case Opcode::VectorUnsignedSaturatedShiftLeft64:
+ case Opcode::VectorUnsignedSaturatedSub8:
+ case Opcode::VectorUnsignedSaturatedSub16:
+ case Opcode::VectorUnsignedSaturatedSub32:
+ case Opcode::VectorUnsignedSaturatedSub64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::CausesCPUException() const {
+ return op == Opcode::Breakpoint
+ || op == Opcode::A32CallSupervisor
+ || op == Opcode::A32ExceptionRaised
+ || op == Opcode::A64CallSupervisor
+ || op == Opcode::A64ExceptionRaised;
+}
+
+bool Inst::AltersExclusiveState() const {
+ return op == Opcode::A32ClearExclusive
+ || op == Opcode::A64ClearExclusive
+ || IsExclusiveMemoryRead()
+ || IsExclusiveMemoryWrite();
+}
+
+bool Inst::IsCoprocessorInstruction() const {
+ switch (op) {
+ case Opcode::A32CoprocInternalOperation:
+ case Opcode::A32CoprocSendOneWord:
+ case Opcode::A32CoprocSendTwoWords:
+ case Opcode::A32CoprocGetOneWord:
+ case Opcode::A32CoprocGetTwoWords:
+ case Opcode::A32CoprocLoadWords:
+ case Opcode::A32CoprocStoreWords:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::IsSetCheckBitOperation() const {
+ return op == Opcode::A32SetCheckBit
+ || op == Opcode::A64SetCheckBit;
+}
+
+bool Inst::MayHaveSideEffects() const {
+ return op == Opcode::PushRSB
+ || op == Opcode::CallHostFunction
+ || op == Opcode::A64DataCacheOperationRaised
+ || op == Opcode::A64InstructionCacheOperationRaised
+ || IsSetCheckBitOperation()
+ || IsBarrier()
+ || CausesCPUException()
+ || WritesToCoreRegister()
+ || WritesToSystemRegister()
+ || WritesToCPSR()
+ || WritesToFPCR()
+ || WritesToFPSR()
+ || AltersExclusiveState()
+ || IsMemoryWrite()
+ || IsCoprocessorInstruction();
+}
+
+bool Inst::IsAPseudoOperation() const {
+ switch (op) {
+ case Opcode::GetCarryFromOp:
+ case Opcode::GetOverflowFromOp:
+ case Opcode::GetGEFromOp:
+ case Opcode::GetNZCVFromOp:
+ case Opcode::GetNZFromOp:
+ case Opcode::GetUpperFromOp:
+ case Opcode::GetLowerFromOp:
+ case Opcode::MostSignificantBit:
+ case Opcode::IsZero32:
+ case Opcode::IsZero64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::MayGetNZCVFromOp() const {
+ switch (op) {
+ case Opcode::Add32:
+ case Opcode::Add64:
+ case Opcode::Sub32:
+ case Opcode::Sub64:
+ case Opcode::And32:
+ case Opcode::And64:
+ case Opcode::AndNot32:
+ case Opcode::AndNot64:
+ case Opcode::Eor32:
+ case Opcode::Eor64:
+ case Opcode::Or32:
+ case Opcode::Or64:
+ case Opcode::Not32:
+ case Opcode::Not64:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool Inst::AreAllArgsImmediates() const {
+ return std::all_of(args.begin(), args.begin() + NumArgs(), [](const auto& value) { return value.IsImmediate(); });
+}
+
+bool Inst::HasAssociatedPseudoOperation() const {
+ return next_pseudoop && !IsAPseudoOperation();
+}
+
+Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) {
+ Inst* pseudoop = next_pseudoop;
+ while (pseudoop) {
+ if (pseudoop->GetOpcode() == opcode) {
+ ASSERT(pseudoop->GetArg(0).GetInst() == this);
+ return pseudoop;
+ }
+ pseudoop = pseudoop->next_pseudoop;
+ }
+ return nullptr;
+}
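+
+// Typical query (illustrative): an optimisation pass holding an arithmetic instruction `inst`
+// can locate its attached flag consumer, if any, with
+//     if (IR::Inst* nzcv = inst->GetAssociatedPseudoOperation(Opcode::GetNZCVFromOp)) { /* ... */ }
+// A null return means no such pseudo-operation is attached.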
+
+Type Inst::GetType() const {
+ if (op == Opcode::Identity)
+ return args[0].GetType();
+ return GetTypeOf(op);
+}
+
+size_t Inst::NumArgs() const {
+ return GetNumArgsOf(op);
+}
+
+Value Inst::GetArg(size_t index) const {
+ ASSERT_MSG(index < GetNumArgsOf(op), "Inst::GetArg: index {} >= number of arguments of {} ({})", index, op, GetNumArgsOf(op));
+    ASSERT_MSG(!args[index].IsEmpty() || GetArgTypeOf(op, index) == IR::Type::Opaque, "Inst::GetArg: index {} is empty (type {})", index, args[index].GetType());
+
+ return args[index];
+}
+
+void Inst::SetArg(size_t index, Value value) {
+ ASSERT_MSG(index < GetNumArgsOf(op), "Inst::SetArg: index {} >= number of arguments of {} ({})", index, op, GetNumArgsOf(op));
+ ASSERT_MSG(AreTypesCompatible(value.GetType(), GetArgTypeOf(op, index)), "Inst::SetArg: type {} of argument {} not compatible with operation {} ({})", value.GetType(), index, op, GetArgTypeOf(op, index));
+
+ if (!args[index].IsImmediate()) {
+ UndoUse(args[index]);
+ }
+ if (!value.IsImmediate()) {
+ Use(value);
+ }
+
+ args[index] = value;
+}
+
+void Inst::Invalidate() {
+ ClearArgs();
+ op = Opcode::Void;
+}
+
+void Inst::ClearArgs() {
+ for (auto& value : args) {
+ if (!value.IsImmediate()) {
+ UndoUse(value);
+ }
+ value = {};
+ }
+}
+
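+// Turns this instruction into an Identity of `replacement`: existing users keep pointing at this
+// instruction but now transparently resolve to the replacement value.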
+void Inst::ReplaceUsesWith(Value replacement) {
+ Invalidate();
+
+ op = Opcode::Identity;
+
+ if (!replacement.IsImmediate()) {
+ Use(replacement);
+ }
+
+ args[0] = replacement;
+}
+
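+// Records that this instruction uses `value`. If this instruction is itself a pseudo-operation,
+// it is also appended to the defining instruction's pseudo-op list so that
+// GetAssociatedPseudoOperation can later find it.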
+void Inst::Use(const Value& value) {
+ value.GetInst()->use_count++;
+
+ if (IsAPseudoOperation()) {
+ if (op == Opcode::GetNZCVFromOp) {
+            ASSERT_MSG(value.GetInst()->MayGetNZCVFromOp(), "This value doesn't support the GetNZCVFromOp pseudo-op");
+ }
+
+ Inst* insert_point = value.GetInst();
+ while (insert_point->next_pseudoop) {
+ insert_point = insert_point->next_pseudoop;
+ DEBUG_ASSERT(insert_point->GetArg(0).GetInst() == value.GetInst());
+ }
+ insert_point->next_pseudoop = this;
+ }
+}
+
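+// Reverses the effect of Use: decrements the reference count and, for pseudo-operations,
+// unlinks this instruction from the defining instruction's pseudo-op list.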
+void Inst::UndoUse(const Value& value) {
+ value.GetInst()->use_count--;
+
+ if (IsAPseudoOperation()) {
+ Inst* insert_point = value.GetInst();
+ while (insert_point->next_pseudoop != this) {
+ insert_point = insert_point->next_pseudoop;
+ DEBUG_ASSERT(insert_point->GetArg(0).GetInst() == value.GetInst());
+ }
+ insert_point->next_pseudoop = next_pseudoop;
+ next_pseudoop = nullptr;
+ }
+}
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/microinstruction.h b/externals/dynarmic/src/dynarmic/ir/microinstruction.h
new file mode 100644
index 0000000000..26b6b899f3
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/microinstruction.h
@@ -0,0 +1,162 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/container/intrusive_list.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::IR {
+
+enum class Opcode;
+enum class Type;
+
+constexpr size_t max_arg_count = 4;
+
+/**
+ * A representation of a microinstruction. A single ARM/Thumb instruction may be
+ * converted into zero or more microinstructions.
+ */
+class Inst final : public mcl::intrusive_list_node<Inst> {
+public:
+ explicit Inst(Opcode op)
+ : op(op) {}
+
+ /// Determines whether or not this instruction performs an arithmetic shift.
+ bool IsArithmeticShift() const;
+ /// Determines whether or not this instruction performs a logical shift.
+ bool IsLogicalShift() const;
+ /// Determines whether or not this instruction performs a circular shift.
+ bool IsCircularShift() const;
+ /// Determines whether or not this instruction performs any kind of shift.
+ bool IsShift() const;
+
+ /// Determines whether or not this instruction is a form of barrier.
+ bool IsBarrier() const;
+
+ /// Determines whether or not this instruction performs a shared memory read.
+ bool IsSharedMemoryRead() const;
+ /// Determines whether or not this instruction performs a shared memory write.
+ bool IsSharedMemoryWrite() const;
+ /// Determines whether or not this instruction performs a shared memory read or write.
+ bool IsSharedMemoryReadOrWrite() const;
+ /// Determines whether or not this instruction performs an atomic memory read.
+ bool IsExclusiveMemoryRead() const;
+ /// Determines whether or not this instruction performs an atomic memory write.
+ bool IsExclusiveMemoryWrite() const;
+
+ /// Determines whether or not this instruction performs any kind of memory read.
+ bool IsMemoryRead() const;
+ /// Determines whether or not this instruction performs any kind of memory write.
+ bool IsMemoryWrite() const;
+ /// Determines whether or not this instruction performs any kind of memory access.
+ bool IsMemoryReadOrWrite() const;
+
+ /// Determines whether or not this instruction reads from the CPSR.
+ bool ReadsFromCPSR() const;
+ /// Determines whether or not this instruction writes to the CPSR.
+ bool WritesToCPSR() const;
+
+ /// Determines whether or not this instruction writes to a system register.
+ bool WritesToSystemRegister() const;
+
+ /// Determines whether or not this instruction reads from a core register.
+ bool ReadsFromCoreRegister() const;
+ /// Determines whether or not this instruction writes to a core register.
+ bool WritesToCoreRegister() const;
+
+ /// Determines whether or not this instruction reads from the FPCR.
+ bool ReadsFromFPCR() const;
+ /// Determines whether or not this instruction writes to the FPCR.
+ bool WritesToFPCR() const;
+
+ /// Determines whether or not this instruction reads from the FPSR.
+ bool ReadsFromFPSR() const;
+ /// Determines whether or not this instruction writes to the FPSR.
+ bool WritesToFPSR() const;
+
+ /// Determines whether or not this instruction reads from the FPSR cumulative exception bits.
+ bool ReadsFromFPSRCumulativeExceptionBits() const;
+ /// Determines whether or not this instruction writes to the FPSR cumulative exception bits.
+ bool WritesToFPSRCumulativeExceptionBits() const;
+ /// Determines whether or not this instruction both reads from and writes to the FPSR cumulative exception bits.
+ bool ReadsFromAndWritesToFPSRCumulativeExceptionBits() const;
+
+ /// Determines whether or not this instruction reads from the FPSR cumulative saturation bit.
+ bool ReadsFromFPSRCumulativeSaturationBit() const;
+ /// Determines whether or not this instruction writes to the FPSR cumulative saturation bit.
+ bool WritesToFPSRCumulativeSaturationBit() const;
+
+ /// Determines whether or not this instruction alters memory-exclusivity.
+ bool AltersExclusiveState() const;
+
+ /// Determines whether or not this instruction accesses a coprocessor.
+ bool IsCoprocessorInstruction() const;
+
+ /// Determines whether or not this instruction causes a CPU exception.
+ bool CausesCPUException() const;
+
+ /// Determines whether or not this instruction is a SetCheckBit operation.
+ bool IsSetCheckBitOperation() const;
+
+ /// Determines whether or not this instruction may have side-effects.
+ bool MayHaveSideEffects() const;
+
+    /// Determines whether or not this instruction is a pseudo-instruction.
+ /// Pseudo-instructions depend on their parent instructions for their semantics.
+ bool IsAPseudoOperation() const;
+
+ /// Determines whether or not this instruction supports the GetNZCVFromOp pseudo-operation.
+ bool MayGetNZCVFromOp() const;
+
+ /// Determines if all arguments of this instruction are immediates.
+ bool AreAllArgsImmediates() const;
+
+ size_t UseCount() const { return use_count; }
+ bool HasUses() const { return use_count > 0; }
+
+ /// Determines if there is a pseudo-operation associated with this instruction.
+ bool HasAssociatedPseudoOperation() const;
+ /// Gets a pseudo-operation associated with this instruction.
+ Inst* GetAssociatedPseudoOperation(Opcode opcode);
+
+ /// Get the microop this microinstruction represents.
+ Opcode GetOpcode() const { return op; }
+ /// Get the type this instruction returns.
+ Type GetType() const;
+ /// Get the number of arguments this instruction has.
+ size_t NumArgs() const;
+
+ Value GetArg(size_t index) const;
+ void SetArg(size_t index, Value value);
+
+ void Invalidate();
+ void ClearArgs();
+
+ void ReplaceUsesWith(Value replacement);
+
+ // IR name (i.e. instruction number in block). This is set in the naming pass. Treat 0 as an invalid name.
+ // This is used for debugging and fastmem instruction identification.
+ void SetName(unsigned value) { name = value; }
+ unsigned GetName() const { return name; }
+
+private:
+ void Use(const Value& value);
+ void UndoUse(const Value& value);
+
+ Opcode op;
+ unsigned use_count = 0;
+ unsigned name = 0;
+ std::array<Value, max_arg_count> args;
+
+    // Linked list of pseudo-operations associated with this instruction.
+ Inst* next_pseudoop = nullptr;
+};
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.cpp b/externals/dynarmic/src/dynarmic/ir/opcodes.cpp
new file mode 100644
index 0000000000..88a52d4f1d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opcodes.cpp
@@ -0,0 +1,71 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/opcodes.h"
+
+#include <array>
+#include <vector>
+
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::IR {
+
+// Opcode information
+
+namespace OpcodeInfo {
+
+struct Meta {
+ const char* name;
+ Type type;
+ std::vector<Type> arg_types;
+};
+
+constexpr Type Void = Type::Void;
+constexpr Type A32Reg = Type::A32Reg;
+constexpr Type A32ExtReg = Type::A32ExtReg;
+constexpr Type A64Reg = Type::A64Reg;
+constexpr Type A64Vec = Type::A64Vec;
+constexpr Type Opaque = Type::Opaque;
+constexpr Type U1 = Type::U1;
+constexpr Type U8 = Type::U8;
+constexpr Type U16 = Type::U16;
+constexpr Type U32 = Type::U32;
+constexpr Type U64 = Type::U64;
+constexpr Type U128 = Type::U128;
+constexpr Type CoprocInfo = Type::CoprocInfo;
+constexpr Type NZCV = Type::NZCVFlags;
+constexpr Type Cond = Type::Cond;
+constexpr Type Table = Type::Table;
+constexpr Type AccType = Type::AccType;
+
+static const std::array opcode_info{
+#define OPCODE(name, type, ...) Meta{#name, type, {__VA_ARGS__}},
+#define A32OPC(name, type, ...) Meta{#name, type, {__VA_ARGS__}},
+#define A64OPC(name, type, ...) Meta{#name, type, {__VA_ARGS__}},
+#include "./opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+};
+
+} // namespace OpcodeInfo
+
+Type GetTypeOf(Opcode op) {
+ return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).type;
+}
+
+size_t GetNumArgsOf(Opcode op) {
+ return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).arg_types.size();
+}
+
+Type GetArgTypeOf(Opcode op, size_t arg_index) {
+ return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).arg_types.at(arg_index);
+}
+
+std::string GetNameOf(Opcode op) {
+ return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).name;
+}
+
+} // namespace Dynarmic::IR
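
The metadata table above is built with an X-macro: opcodes.inc is textually included with OPCODE/A32OPC/A64OPC defined to emit one Meta row per line, and the same file is included again in opcodes.h below to emit the enum entries, so the enum and the table stay in lockstep by construction. The following minimal, self-contained sketch shows the same pattern; the names (MiniOpcode, MiniMeta, MINI_OPCODE_LIST) are illustrative only and not part of dynarmic.

#include <array>
#include <cstddef>
#include <cstdio>

enum class MiniType { Void, U32, U64 };

struct MiniMeta {
    const char* name;
    MiniType ret;
    std::size_t num_args;
};

// The role of opcodes.inc is played here by a macro list for brevity.
#define MINI_OPCODE_LIST(X) \
    X(Add32, U32, 2)        \
    X(Not64, U64, 1)        \
    X(Breakpoint, Void, 0)

// First expansion: the enum.
enum class MiniOpcode {
#define X(name, ret, nargs) name,
    MINI_OPCODE_LIST(X)
#undef X
};

// Second expansion: the parallel metadata table.
static const std::array mini_info{
#define X(name, ret, nargs) MiniMeta{#name, MiniType::ret, nargs},
    MINI_OPCODE_LIST(X)
#undef X
};

int main() {
    const auto& m = mini_info[static_cast<std::size_t>(MiniOpcode::Not64)];
    std::printf("%s takes %zu argument(s)\n", m.name, m.num_args);  // prints: Not64 takes 1 argument(s)
}
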
diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.h b/externals/dynarmic/src/dynarmic/ir/opcodes.h
new file mode 100644
index 0000000000..404f4cdbf4
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opcodes.h
@@ -0,0 +1,54 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+
+#include <fmt/format.h>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::IR {
+
+enum class Type;
+
+/**
+ * The Opcodes of our intermediate representation.
+ * Type signatures for each opcode can be found in opcodes.inc
+ */
+enum class Opcode {
+#define OPCODE(name, type, ...) name,
+#define A32OPC(name, type, ...) A32##name,
+#define A64OPC(name, type, ...) A64##name,
+#include "./opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+ NUM_OPCODE
+};
+
+constexpr size_t OpcodeCount = static_cast<size_t>(Opcode::NUM_OPCODE);
+
+/// Get return type of an opcode
+Type GetTypeOf(Opcode op);
+
+/// Get the number of arguments an opcode accepts
+size_t GetNumArgsOf(Opcode op);
+
+/// Get the required type of an argument of an opcode
+Type GetArgTypeOf(Opcode op, size_t arg_index);
+
+/// Get the name of an opcode.
+std::string GetNameOf(Opcode op);
+
+} // namespace Dynarmic::IR
+
+template<>
+struct fmt::formatter<Dynarmic::IR::Opcode> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::IR::Opcode op, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::IR::GetNameOf(op), ctx);
+ }
+};
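
The query functions declared above, together with the fmt formatter specialization, are enough to print an opcode's signature. A hedged usage sketch follows; it assumes the dynarmic and fmt include paths used elsewhere in this tree, and the concrete values in the asserts come from the Add32 line in opcodes.inc (returns U32, third argument is a U1 carry-in).

#include <cassert>

#include <fmt/format.h>

#include "dynarmic/ir/opcodes.h"
#include "dynarmic/ir/type.h"

void DumpOpcode(Dynarmic::IR::Opcode op) {
    using namespace Dynarmic::IR;
    // Opcode is formattable directly thanks to the fmt::formatter specialization above.
    fmt::print("{} takes {} argument(s)\n", op, GetNumArgsOf(op));
}

void CheckAdd32Signature() {
    using namespace Dynarmic::IR;
    assert(GetTypeOf(Opcode::Add32) == Type::U32);
    assert(GetArgTypeOf(Opcode::Add32, 2) == Type::U1);  // the carry-in argument
}
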
diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.inc b/externals/dynarmic/src/dynarmic/ir/opcodes.inc
new file mode 100644
index 0000000000..dd060e0a99
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opcodes.inc
@@ -0,0 +1,770 @@
+// clang-format off
+
+// opcode name, return type, arg1 type, arg2 type, arg3 type, arg4 type, ...
+
+OPCODE(Void, Void, )
+OPCODE(Identity, Opaque, Opaque )
+OPCODE(Breakpoint, Void, )
+OPCODE(CallHostFunction, Void, U64, Opaque, Opaque, Opaque )
+
+// A32 Context getters/setters
+A32OPC(SetCheckBit, Void, U1 )
+A32OPC(GetRegister, U32, A32Reg )
+A32OPC(GetExtendedRegister32, U32, A32ExtReg )
+A32OPC(GetExtendedRegister64, U64, A32ExtReg )
+A32OPC(GetVector, U128, A32ExtReg )
+A32OPC(SetRegister, Void, A32Reg, U32 )
+A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 )
+A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 )
+A32OPC(SetVector, Void, A32ExtReg, U128 )
+A32OPC(GetCpsr, U32, )
+A32OPC(SetCpsr, Void, U32 )
+A32OPC(SetCpsrNZCV, Void, NZCV )
+A32OPC(SetCpsrNZCVRaw, Void, U32 )
+A32OPC(SetCpsrNZCVQ, Void, U32 )
+A32OPC(SetCpsrNZ, Void, NZCV )
+A32OPC(SetCpsrNZC, Void, NZCV, U1 )
+A32OPC(GetCFlag, U1, )
+A32OPC(OrQFlag, Void, U1 )
+A32OPC(GetGEFlags, U32, )
+A32OPC(SetGEFlags, Void, U32 )
+A32OPC(SetGEFlagsCompressed, Void, U32 )
+A32OPC(BXWritePC, Void, U32 )
+A32OPC(UpdateUpperLocationDescriptor, Void, )
+A32OPC(CallSupervisor, Void, U32 )
+A32OPC(ExceptionRaised, Void, U32, U64 )
+A32OPC(DataSynchronizationBarrier, Void, )
+A32OPC(DataMemoryBarrier, Void, )
+A32OPC(InstructionSynchronizationBarrier, Void, )
+A32OPC(GetFpscr, U32, )
+A32OPC(SetFpscr, Void, U32, )
+A32OPC(GetFpscrNZCV, U32, )
+A32OPC(SetFpscrNZCV, Void, NZCV )
+
+// A64 Context getters/setters
+A64OPC(SetCheckBit, Void, U1 )
+A64OPC(GetCFlag, U1, )
+A64OPC(GetNZCVRaw, U32, )
+A64OPC(SetNZCVRaw, Void, U32 )
+A64OPC(SetNZCV, Void, NZCV )
+A64OPC(GetW, U32, A64Reg )
+A64OPC(GetX, U64, A64Reg )
+A64OPC(GetS, U128, A64Vec )
+A64OPC(GetD, U128, A64Vec )
+A64OPC(GetQ, U128, A64Vec )
+A64OPC(GetSP, U64, )
+A64OPC(GetFPCR, U32, )
+A64OPC(GetFPSR, U32, )
+A64OPC(SetW, Void, A64Reg, U32 )
+A64OPC(SetX, Void, A64Reg, U64 )
+A64OPC(SetS, Void, A64Vec, U128 )
+A64OPC(SetD, Void, A64Vec, U128 )
+A64OPC(SetQ, Void, A64Vec, U128 )
+A64OPC(SetSP, Void, U64 )
+A64OPC(SetFPCR, Void, U32 )
+A64OPC(SetFPSR, Void, U32 )
+A64OPC(SetPC, Void, U64 )
+A64OPC(CallSupervisor, Void, U32 )
+A64OPC(ExceptionRaised, Void, U64, U64 )
+A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 )
+A64OPC(InstructionCacheOperationRaised, Void, U64, U64 )
+A64OPC(DataSynchronizationBarrier, Void, )
+A64OPC(DataMemoryBarrier, Void, )
+A64OPC(InstructionSynchronizationBarrier, Void, )
+A64OPC(GetCNTFRQ, U32, )
+A64OPC(GetCNTPCT, U64, )
+A64OPC(GetCTR, U32, )
+A64OPC(GetDCZID, U32, )
+A64OPC(GetTPIDR, U64, )
+A64OPC(GetTPIDRRO, U64, )
+A64OPC(SetTPIDR, Void, U64 )
+
+// Hints
+OPCODE(PushRSB, Void, U64 )
+
+// Pseudo-operation, handled specially at final emit
+OPCODE(GetCarryFromOp, U1, Opaque )
+OPCODE(GetOverflowFromOp, U1, Opaque )
+OPCODE(GetGEFromOp, U32, Opaque )
+OPCODE(GetNZCVFromOp, NZCV, Opaque )
+OPCODE(GetNZFromOp, NZCV, Opaque )
+OPCODE(GetUpperFromOp, U128, Opaque )
+OPCODE(GetLowerFromOp, U128, Opaque )
+
+OPCODE(GetCFlagFromNZCV, U1, NZCV )
+OPCODE(NZCVFromPackedFlags, NZCV, U32 )
+
+// Calculations
+OPCODE(Pack2x32To1x64, U64, U32, U32 )
+OPCODE(Pack2x64To1x128, U128, U64, U64 )
+OPCODE(LeastSignificantWord, U32, U64 )
+OPCODE(LeastSignificantHalf, U16, U32 )
+OPCODE(LeastSignificantByte, U8, U32 )
+OPCODE(MostSignificantWord, U32, U64 )
+OPCODE(MostSignificantBit, U1, U32 )
+OPCODE(IsZero32, U1, U32 )
+OPCODE(IsZero64, U1, U64 )
+OPCODE(TestBit, U1, U64, U8 )
+OPCODE(ConditionalSelect32, U32, Cond, U32, U32 )
+OPCODE(ConditionalSelect64, U64, Cond, U64, U64 )
+OPCODE(ConditionalSelectNZCV, NZCV, Cond, NZCV, NZCV )
+OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 )
+OPCODE(LogicalShiftLeft64, U64, U64, U8 )
+OPCODE(LogicalShiftRight32, U32, U32, U8, U1 )
+OPCODE(LogicalShiftRight64, U64, U64, U8 )
+OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 )
+OPCODE(ArithmeticShiftRight64, U64, U64, U8 )
+OPCODE(RotateRight32, U32, U32, U8, U1 )
+OPCODE(RotateRight64, U64, U64, U8 )
+OPCODE(RotateRightExtended, U32, U32, U1 )
+OPCODE(LogicalShiftLeftMasked32, U32, U32, U32 )
+OPCODE(LogicalShiftLeftMasked64, U64, U64, U64 )
+OPCODE(LogicalShiftRightMasked32, U32, U32, U32 )
+OPCODE(LogicalShiftRightMasked64, U64, U64, U64 )
+OPCODE(ArithmeticShiftRightMasked32, U32, U32, U32 )
+OPCODE(ArithmeticShiftRightMasked64, U64, U64, U64 )
+OPCODE(RotateRightMasked32, U32, U32, U32 )
+OPCODE(RotateRightMasked64, U64, U64, U64 )
+OPCODE(Add32, U32, U32, U32, U1 )
+OPCODE(Add64, U64, U64, U64, U1 )
+OPCODE(Sub32, U32, U32, U32, U1 )
+OPCODE(Sub64, U64, U64, U64, U1 )
+OPCODE(Mul32, U32, U32, U32 )
+OPCODE(Mul64, U64, U64, U64 )
+OPCODE(SignedMultiplyHigh64, U64, U64, U64 )
+OPCODE(UnsignedMultiplyHigh64, U64, U64, U64 )
+OPCODE(UnsignedDiv32, U32, U32, U32 )
+OPCODE(UnsignedDiv64, U64, U64, U64 )
+OPCODE(SignedDiv32, U32, U32, U32 )
+OPCODE(SignedDiv64, U64, U64, U64 )
+OPCODE(And32, U32, U32, U32 )
+OPCODE(And64, U64, U64, U64 )
+OPCODE(AndNot32, U32, U32, U32 )
+OPCODE(AndNot64, U64, U64, U64 )
+OPCODE(Eor32, U32, U32, U32 )
+OPCODE(Eor64, U64, U64, U64 )
+OPCODE(Or32, U32, U32, U32 )
+OPCODE(Or64, U64, U64, U64 )
+OPCODE(Not32, U32, U32 )
+OPCODE(Not64, U64, U64 )
+OPCODE(SignExtendByteToWord, U32, U8 )
+OPCODE(SignExtendHalfToWord, U32, U16 )
+OPCODE(SignExtendByteToLong, U64, U8 )
+OPCODE(SignExtendHalfToLong, U64, U16 )
+OPCODE(SignExtendWordToLong, U64, U32 )
+OPCODE(ZeroExtendByteToWord, U32, U8 )
+OPCODE(ZeroExtendHalfToWord, U32, U16 )
+OPCODE(ZeroExtendByteToLong, U64, U8 )
+OPCODE(ZeroExtendHalfToLong, U64, U16 )
+OPCODE(ZeroExtendWordToLong, U64, U32 )
+OPCODE(ZeroExtendLongToQuad, U128, U64 )
+OPCODE(ByteReverseWord, U32, U32 )
+OPCODE(ByteReverseHalf, U16, U16 )
+OPCODE(ByteReverseDual, U64, U64 )
+OPCODE(CountLeadingZeros32, U32, U32 )
+OPCODE(CountLeadingZeros64, U64, U64 )
+OPCODE(ExtractRegister32, U32, U32, U32, U8 )
+OPCODE(ExtractRegister64, U64, U64, U64, U8 )
+OPCODE(ReplicateBit32, U32, U32, U8 )
+OPCODE(ReplicateBit64, U64, U64, U8 )
+OPCODE(MaxSigned32, U32, U32, U32 )
+OPCODE(MaxSigned64, U64, U64, U64 )
+OPCODE(MaxUnsigned32, U32, U32, U32 )
+OPCODE(MaxUnsigned64, U64, U64, U64 )
+OPCODE(MinSigned32, U32, U32, U32 )
+OPCODE(MinSigned64, U64, U64, U64 )
+OPCODE(MinUnsigned32, U32, U32, U32 )
+OPCODE(MinUnsigned64, U64, U64, U64 )
+
+// Saturated instructions
+OPCODE(SignedSaturatedAddWithFlag32, U32, U32, U32 )
+OPCODE(SignedSaturatedSubWithFlag32, U32, U32, U32 )
+OPCODE(SignedSaturation, U32, U32, U8 )
+OPCODE(UnsignedSaturation, U32, U32, U8 )
+OPCODE(SignedSaturatedAdd8, U8, U8, U8 )
+OPCODE(SignedSaturatedAdd16, U16, U16, U16 )
+OPCODE(SignedSaturatedAdd32, U32, U32, U32 )
+OPCODE(SignedSaturatedAdd64, U64, U64, U64 )
+OPCODE(SignedSaturatedDoublingMultiplyReturnHigh16, U16, U16, U16 )
+OPCODE(SignedSaturatedDoublingMultiplyReturnHigh32, U32, U32, U32 )
+OPCODE(SignedSaturatedSub8, U8, U8, U8 )
+OPCODE(SignedSaturatedSub16, U16, U16, U16 )
+OPCODE(SignedSaturatedSub32, U32, U32, U32 )
+OPCODE(SignedSaturatedSub64, U64, U64, U64 )
+OPCODE(UnsignedSaturatedAdd8, U8, U8, U8 )
+OPCODE(UnsignedSaturatedAdd16, U16, U16, U16 )
+OPCODE(UnsignedSaturatedAdd32, U32, U32, U32 )
+OPCODE(UnsignedSaturatedAdd64, U64, U64, U64 )
+OPCODE(UnsignedSaturatedSub8, U8, U8, U8 )
+OPCODE(UnsignedSaturatedSub16, U16, U16, U16 )
+OPCODE(UnsignedSaturatedSub32, U32, U32, U32 )
+OPCODE(UnsignedSaturatedSub64, U64, U64, U64 )
+
+// Vector saturated instructions
+OPCODE(VectorSignedSaturatedAdd8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAdd64, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedSub64, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAdd64, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedSub64, U128, U128, U128 )
+
+// Packed instructions
+OPCODE(PackedAddU8, U32, U32, U32 )
+OPCODE(PackedAddS8, U32, U32, U32 )
+OPCODE(PackedSubU8, U32, U32, U32 )
+OPCODE(PackedSubS8, U32, U32, U32 )
+OPCODE(PackedAddU16, U32, U32, U32 )
+OPCODE(PackedAddS16, U32, U32, U32 )
+OPCODE(PackedSubU16, U32, U32, U32 )
+OPCODE(PackedSubS16, U32, U32, U32 )
+OPCODE(PackedAddSubU16, U32, U32, U32 )
+OPCODE(PackedAddSubS16, U32, U32, U32 )
+OPCODE(PackedSubAddU16, U32, U32, U32 )
+OPCODE(PackedSubAddS16, U32, U32, U32 )
+OPCODE(PackedHalvingAddU8, U32, U32, U32 )
+OPCODE(PackedHalvingAddS8, U32, U32, U32 )
+OPCODE(PackedHalvingSubU8, U32, U32, U32 )
+OPCODE(PackedHalvingSubS8, U32, U32, U32 )
+OPCODE(PackedHalvingAddU16, U32, U32, U32 )
+OPCODE(PackedHalvingAddS16, U32, U32, U32 )
+OPCODE(PackedHalvingSubU16, U32, U32, U32 )
+OPCODE(PackedHalvingSubS16, U32, U32, U32 )
+OPCODE(PackedHalvingAddSubU16, U32, U32, U32 )
+OPCODE(PackedHalvingAddSubS16, U32, U32, U32 )
+OPCODE(PackedHalvingSubAddU16, U32, U32, U32 )
+OPCODE(PackedHalvingSubAddS16, U32, U32, U32 )
+OPCODE(PackedSaturatedAddU8, U32, U32, U32 )
+OPCODE(PackedSaturatedAddS8, U32, U32, U32 )
+OPCODE(PackedSaturatedSubU8, U32, U32, U32 )
+OPCODE(PackedSaturatedSubS8, U32, U32, U32 )
+OPCODE(PackedSaturatedAddU16, U32, U32, U32 )
+OPCODE(PackedSaturatedAddS16, U32, U32, U32 )
+OPCODE(PackedSaturatedSubU16, U32, U32, U32 )
+OPCODE(PackedSaturatedSubS16, U32, U32, U32 )
+OPCODE(PackedAbsDiffSumU8, U32, U32, U32 )
+OPCODE(PackedSelect, U32, U32, U32, U32 )
+
+// CRC instructions
+OPCODE(CRC32Castagnoli8, U32, U32, U32 )
+OPCODE(CRC32Castagnoli16, U32, U32, U32 )
+OPCODE(CRC32Castagnoli32, U32, U32, U32 )
+OPCODE(CRC32Castagnoli64, U32, U32, U64 )
+OPCODE(CRC32ISO8, U32, U32, U32 )
+OPCODE(CRC32ISO16, U32, U32, U32 )
+OPCODE(CRC32ISO32, U32, U32, U32 )
+OPCODE(CRC32ISO64, U32, U32, U64 )
+
+// AES instructions
+OPCODE(AESDecryptSingleRound, U128, U128 )
+OPCODE(AESEncryptSingleRound, U128, U128 )
+OPCODE(AESInverseMixColumns, U128, U128 )
+OPCODE(AESMixColumns, U128, U128 )
+
+// SM4 instructions
+OPCODE(SM4AccessSubstitutionBox, U8, U8 )
+
+// SHA instructions
+OPCODE(SHA256Hash, U128, U128, U128, U128, U1 )
+OPCODE(SHA256MessageSchedule0, U128, U128, U128 )
+OPCODE(SHA256MessageSchedule1, U128, U128, U128, U128 )
+
+// Vector instructions
+OPCODE(VectorGetElement8, U8, U128, U8 )
+OPCODE(VectorGetElement16, U16, U128, U8 )
+OPCODE(VectorGetElement32, U32, U128, U8 )
+OPCODE(VectorGetElement64, U64, U128, U8 )
+OPCODE(VectorSetElement8, U128, U128, U8, U8 )
+OPCODE(VectorSetElement16, U128, U128, U8, U16 )
+OPCODE(VectorSetElement32, U128, U128, U8, U32 )
+OPCODE(VectorSetElement64, U128, U128, U8, U64 )
+OPCODE(VectorAbs8, U128, U128 )
+OPCODE(VectorAbs16, U128, U128 )
+OPCODE(VectorAbs32, U128, U128 )
+OPCODE(VectorAbs64, U128, U128 )
+OPCODE(VectorAdd8, U128, U128, U128 )
+OPCODE(VectorAdd16, U128, U128, U128 )
+OPCODE(VectorAdd32, U128, U128, U128 )
+OPCODE(VectorAdd64, U128, U128, U128 )
+OPCODE(VectorAnd, U128, U128, U128 )
+OPCODE(VectorAndNot, U128, U128, U128 )
+OPCODE(VectorArithmeticShiftRight8, U128, U128, U8 )
+OPCODE(VectorArithmeticShiftRight16, U128, U128, U8 )
+OPCODE(VectorArithmeticShiftRight32, U128, U128, U8 )
+OPCODE(VectorArithmeticShiftRight64, U128, U128, U8 )
+OPCODE(VectorArithmeticVShift8, U128, U128, U128 )
+OPCODE(VectorArithmeticVShift16, U128, U128, U128 )
+OPCODE(VectorArithmeticVShift32, U128, U128, U128 )
+OPCODE(VectorArithmeticVShift64, U128, U128, U128 )
+OPCODE(VectorBroadcastLower8, U128, U8 )
+OPCODE(VectorBroadcastLower16, U128, U16 )
+OPCODE(VectorBroadcastLower32, U128, U32 )
+OPCODE(VectorBroadcast8, U128, U8 )
+OPCODE(VectorBroadcast16, U128, U16 )
+OPCODE(VectorBroadcast32, U128, U32 )
+OPCODE(VectorBroadcast64, U128, U64 )
+OPCODE(VectorBroadcastElementLower8, U128, U128, U8 )
+OPCODE(VectorBroadcastElementLower16, U128, U128, U8 )
+OPCODE(VectorBroadcastElementLower32, U128, U128, U8 )
+OPCODE(VectorBroadcastElement8, U128, U128, U8 )
+OPCODE(VectorBroadcastElement16, U128, U128, U8 )
+OPCODE(VectorBroadcastElement32, U128, U128, U8 )
+OPCODE(VectorBroadcastElement64, U128, U128, U8 )
+OPCODE(VectorCountLeadingZeros8, U128, U128 )
+OPCODE(VectorCountLeadingZeros16, U128, U128 )
+OPCODE(VectorCountLeadingZeros32, U128, U128 )
+OPCODE(VectorDeinterleaveEven8, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEven16, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEven32, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEven64, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEvenLower8, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEvenLower16, U128, U128, U128 )
+OPCODE(VectorDeinterleaveEvenLower32, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOdd8, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOdd16, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOdd32, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOdd64, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOddLower8, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOddLower16, U128, U128, U128 )
+OPCODE(VectorDeinterleaveOddLower32, U128, U128, U128 )
+OPCODE(VectorEor, U128, U128, U128 )
+OPCODE(VectorEqual8, U128, U128, U128 )
+OPCODE(VectorEqual16, U128, U128, U128 )
+OPCODE(VectorEqual32, U128, U128, U128 )
+OPCODE(VectorEqual64, U128, U128, U128 )
+OPCODE(VectorEqual128, U128, U128, U128 )
+OPCODE(VectorExtract, U128, U128, U128, U8 )
+OPCODE(VectorExtractLower, U128, U128, U128, U8 )
+OPCODE(VectorGreaterS8, U128, U128, U128 )
+OPCODE(VectorGreaterS16, U128, U128, U128 )
+OPCODE(VectorGreaterS32, U128, U128, U128 )
+OPCODE(VectorGreaterS64, U128, U128, U128 )
+OPCODE(VectorHalvingAddS8, U128, U128, U128 )
+OPCODE(VectorHalvingAddS16, U128, U128, U128 )
+OPCODE(VectorHalvingAddS32, U128, U128, U128 )
+OPCODE(VectorHalvingAddU8, U128, U128, U128 )
+OPCODE(VectorHalvingAddU16, U128, U128, U128 )
+OPCODE(VectorHalvingAddU32, U128, U128, U128 )
+OPCODE(VectorHalvingSubS8, U128, U128, U128 )
+OPCODE(VectorHalvingSubS16, U128, U128, U128 )
+OPCODE(VectorHalvingSubS32, U128, U128, U128 )
+OPCODE(VectorHalvingSubU8, U128, U128, U128 )
+OPCODE(VectorHalvingSubU16, U128, U128, U128 )
+OPCODE(VectorHalvingSubU32, U128, U128, U128 )
+OPCODE(VectorInterleaveLower8, U128, U128, U128 )
+OPCODE(VectorInterleaveLower16, U128, U128, U128 )
+OPCODE(VectorInterleaveLower32, U128, U128, U128 )
+OPCODE(VectorInterleaveLower64, U128, U128, U128 )
+OPCODE(VectorInterleaveUpper8, U128, U128, U128 )
+OPCODE(VectorInterleaveUpper16, U128, U128, U128 )
+OPCODE(VectorInterleaveUpper32, U128, U128, U128 )
+OPCODE(VectorInterleaveUpper64, U128, U128, U128 )
+OPCODE(VectorLogicalShiftLeft8, U128, U128, U8 )
+OPCODE(VectorLogicalShiftLeft16, U128, U128, U8 )
+OPCODE(VectorLogicalShiftLeft32, U128, U128, U8 )
+OPCODE(VectorLogicalShiftLeft64, U128, U128, U8 )
+OPCODE(VectorLogicalShiftRight8, U128, U128, U8 )
+OPCODE(VectorLogicalShiftRight16, U128, U128, U8 )
+OPCODE(VectorLogicalShiftRight32, U128, U128, U8 )
+OPCODE(VectorLogicalShiftRight64, U128, U128, U8 )
+OPCODE(VectorLogicalVShift8, U128, U128, U128 )
+OPCODE(VectorLogicalVShift16, U128, U128, U128 )
+OPCODE(VectorLogicalVShift32, U128, U128, U128 )
+OPCODE(VectorLogicalVShift64, U128, U128, U128 )
+OPCODE(VectorMaxS8, U128, U128, U128 )
+OPCODE(VectorMaxS16, U128, U128, U128 )
+OPCODE(VectorMaxS32, U128, U128, U128 )
+OPCODE(VectorMaxS64, U128, U128, U128 )
+OPCODE(VectorMaxU8, U128, U128, U128 )
+OPCODE(VectorMaxU16, U128, U128, U128 )
+OPCODE(VectorMaxU32, U128, U128, U128 )
+OPCODE(VectorMaxU64, U128, U128, U128 )
+OPCODE(VectorMinS8, U128, U128, U128 )
+OPCODE(VectorMinS16, U128, U128, U128 )
+OPCODE(VectorMinS32, U128, U128, U128 )
+OPCODE(VectorMinS64, U128, U128, U128 )
+OPCODE(VectorMinU8, U128, U128, U128 )
+OPCODE(VectorMinU16, U128, U128, U128 )
+OPCODE(VectorMinU32, U128, U128, U128 )
+OPCODE(VectorMinU64, U128, U128, U128 )
+OPCODE(VectorMultiply8, U128, U128, U128 )
+OPCODE(VectorMultiply16, U128, U128, U128 )
+OPCODE(VectorMultiply32, U128, U128, U128 )
+OPCODE(VectorMultiply64, U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden8, U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden16, U128, U128, U128 )
+OPCODE(VectorMultiplySignedWiden32, U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden8, U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden16, U128, U128, U128 )
+OPCODE(VectorMultiplyUnsignedWiden32, U128, U128, U128 )
+OPCODE(VectorNarrow16, U128, U128 )
+OPCODE(VectorNarrow32, U128, U128 )
+OPCODE(VectorNarrow64, U128, U128 )
+OPCODE(VectorNot, U128, U128 )
+OPCODE(VectorOr, U128, U128, U128 )
+OPCODE(VectorPairedAddLower8, U128, U128, U128 )
+OPCODE(VectorPairedAddLower16, U128, U128, U128 )
+OPCODE(VectorPairedAddLower32, U128, U128, U128 )
+OPCODE(VectorPairedAddSignedWiden8, U128, U128 )
+OPCODE(VectorPairedAddSignedWiden16, U128, U128 )
+OPCODE(VectorPairedAddSignedWiden32, U128, U128 )
+OPCODE(VectorPairedAddUnsignedWiden8, U128, U128 )
+OPCODE(VectorPairedAddUnsignedWiden16, U128, U128 )
+OPCODE(VectorPairedAddUnsignedWiden32, U128, U128 )
+OPCODE(VectorPairedAdd8, U128, U128, U128 )
+OPCODE(VectorPairedAdd16, U128, U128, U128 )
+OPCODE(VectorPairedAdd32, U128, U128, U128 )
+OPCODE(VectorPairedAdd64, U128, U128, U128 )
+OPCODE(VectorPairedMaxS8, U128, U128, U128 )
+OPCODE(VectorPairedMaxS16, U128, U128, U128 )
+OPCODE(VectorPairedMaxS32, U128, U128, U128 )
+OPCODE(VectorPairedMaxU8, U128, U128, U128 )
+OPCODE(VectorPairedMaxU16, U128, U128, U128 )
+OPCODE(VectorPairedMaxU32, U128, U128, U128 )
+OPCODE(VectorPairedMinS8, U128, U128, U128 )
+OPCODE(VectorPairedMinS16, U128, U128, U128 )
+OPCODE(VectorPairedMinS32, U128, U128, U128 )
+OPCODE(VectorPairedMinU8, U128, U128, U128 )
+OPCODE(VectorPairedMinU16, U128, U128, U128 )
+OPCODE(VectorPairedMinU32, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerS8, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerS16, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerS32, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerU8, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerU16, U128, U128, U128 )
+OPCODE(VectorPairedMaxLowerU32, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerS8, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerS16, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerS32, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerU8, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerU16, U128, U128, U128 )
+OPCODE(VectorPairedMinLowerU32, U128, U128, U128 )
+OPCODE(VectorPolynomialMultiply8, U128, U128, U128 )
+OPCODE(VectorPolynomialMultiplyLong8, U128, U128, U128 )
+OPCODE(VectorPolynomialMultiplyLong64, U128, U128, U128 )
+OPCODE(VectorPopulationCount, U128, U128 )
+OPCODE(VectorReverseBits, U128, U128 )
+OPCODE(VectorReverseElementsInHalfGroups8, U128, U128 )
+OPCODE(VectorReverseElementsInWordGroups8, U128, U128 )
+OPCODE(VectorReverseElementsInWordGroups16, U128, U128 )
+OPCODE(VectorReverseElementsInLongGroups8, U128, U128 )
+OPCODE(VectorReverseElementsInLongGroups16, U128, U128 )
+OPCODE(VectorReverseElementsInLongGroups32, U128, U128 )
+OPCODE(VectorReduceAdd8, U128, U128 )
+OPCODE(VectorReduceAdd16, U128, U128 )
+OPCODE(VectorReduceAdd32, U128, U128 )
+OPCODE(VectorReduceAdd64, U128, U128 )
+OPCODE(VectorRotateWholeVectorRight, U128, U128, U8 )
+OPCODE(VectorRoundingHalvingAddS8, U128, U128, U128 )
+OPCODE(VectorRoundingHalvingAddS16, U128, U128, U128 )
+OPCODE(VectorRoundingHalvingAddS32, U128, U128, U128 )
+OPCODE(VectorRoundingHalvingAddU8, U128, U128, U128 )
+OPCODE(VectorRoundingHalvingAddU16, U128, U128, U128 )
+OPCODE(VectorRoundingHalvingAddU32, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftS8, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftS16, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftS32, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftS64, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftU8, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftU16, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftU32, U128, U128, U128 )
+OPCODE(VectorRoundingShiftLeftU64, U128, U128, U128 )
+OPCODE(VectorSignExtend8, U128, U128 )
+OPCODE(VectorSignExtend16, U128, U128 )
+OPCODE(VectorSignExtend32, U128, U128 )
+OPCODE(VectorSignExtend64, U128, U128 )
+OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 )
+OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 )
+OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 )
+OPCODE(VectorSignedMultiply16, Void, U128, U128 )
+OPCODE(VectorSignedMultiply32, Void, U128, U128 )
+OPCODE(VectorSignedSaturatedAbs8, U128, U128 )
+OPCODE(VectorSignedSaturatedAbs16, U128, U128 )
+OPCODE(VectorSignedSaturatedAbs32, U128, U128 )
+OPCODE(VectorSignedSaturatedAbs64, U128, U128 )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyHigh16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyHigh32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyHighRounding16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyHighRounding32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyLong16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedDoublingMultiplyLong32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToUnsigned16, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToUnsigned32, U128, U128 )
+OPCODE(VectorSignedSaturatedNarrowToUnsigned64, U128, U128 )
+OPCODE(VectorSignedSaturatedNeg8, U128, U128 )
+OPCODE(VectorSignedSaturatedNeg16, U128, U128 )
+OPCODE(VectorSignedSaturatedNeg32, U128, U128 )
+OPCODE(VectorSignedSaturatedNeg64, U128, U128 )
+OPCODE(VectorSignedSaturatedShiftLeft8, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedShiftLeft16, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedShiftLeft32, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedShiftLeft64, U128, U128, U128 )
+OPCODE(VectorSignedSaturatedShiftLeftUnsigned8, U128, U128, U8 )
+OPCODE(VectorSignedSaturatedShiftLeftUnsigned16, U128, U128, U8 )
+OPCODE(VectorSignedSaturatedShiftLeftUnsigned32, U128, U128, U8 )
+OPCODE(VectorSignedSaturatedShiftLeftUnsigned64, U128, U128, U8 )
+OPCODE(VectorSub8, U128, U128, U128 )
+OPCODE(VectorSub16, U128, U128, U128 )
+OPCODE(VectorSub32, U128, U128, U128 )
+OPCODE(VectorSub64, U128, U128, U128 )
+OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque )
+OPCODE(VectorTableLookup64, U64, U64, Table, U64 )
+OPCODE(VectorTableLookup128, U128, U128, Table, U128 )
+OPCODE(VectorTranspose8, U128, U128, U128, U1 )
+OPCODE(VectorTranspose16, U128, U128, U128, U1 )
+OPCODE(VectorTranspose32, U128, U128, U128, U1 )
+OPCODE(VectorTranspose64, U128, U128, U128, U1 )
+OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 )
+OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 )
+OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )
+OPCODE(VectorUnsignedMultiply16, Void, U128, U128 )
+OPCODE(VectorUnsignedMultiply32, Void, U128, U128 )
+OPCODE(VectorUnsignedRecipEstimate, U128, U128 )
+OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedAccumulateSigned64, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedNarrow16, U128, U128 )
+OPCODE(VectorUnsignedSaturatedNarrow32, U128, U128 )
+OPCODE(VectorUnsignedSaturatedNarrow64, U128, U128 )
+OPCODE(VectorUnsignedSaturatedShiftLeft8, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedShiftLeft16, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedShiftLeft32, U128, U128, U128 )
+OPCODE(VectorUnsignedSaturatedShiftLeft64, U128, U128, U128 )
+OPCODE(VectorZeroExtend8, U128, U128 )
+OPCODE(VectorZeroExtend16, U128, U128 )
+OPCODE(VectorZeroExtend32, U128, U128 )
+OPCODE(VectorZeroExtend64, U128, U128 )
+OPCODE(VectorZeroUpper, U128, U128 )
+OPCODE(ZeroVector, U128, )
+
+// Floating-point operations
+OPCODE(FPAbs16, U16, U16 )
+OPCODE(FPAbs32, U32, U32 )
+OPCODE(FPAbs64, U64, U64 )
+OPCODE(FPAdd32, U32, U32, U32 )
+OPCODE(FPAdd64, U64, U64, U64 )
+OPCODE(FPCompare32, NZCV, U32, U32, U1 )
+OPCODE(FPCompare64, NZCV, U64, U64, U1 )
+OPCODE(FPDiv32, U32, U32, U32 )
+OPCODE(FPDiv64, U64, U64, U64 )
+OPCODE(FPMax32, U32, U32, U32 )
+OPCODE(FPMax64, U64, U64, U64 )
+OPCODE(FPMaxNumeric32, U32, U32, U32 )
+OPCODE(FPMaxNumeric64, U64, U64, U64 )
+OPCODE(FPMin32, U32, U32, U32 )
+OPCODE(FPMin64, U64, U64, U64 )
+OPCODE(FPMinNumeric32, U32, U32, U32 )
+OPCODE(FPMinNumeric64, U64, U64, U64 )
+OPCODE(FPMul32, U32, U32, U32 )
+OPCODE(FPMul64, U64, U64, U64 )
+OPCODE(FPMulAdd16, U16, U16, U16, U16 )
+OPCODE(FPMulAdd32, U32, U32, U32, U32 )
+OPCODE(FPMulAdd64, U64, U64, U64, U64 )
+OPCODE(FPMulSub16, U16, U16, U16, U16 )
+OPCODE(FPMulSub32, U32, U32, U32, U32 )
+OPCODE(FPMulSub64, U64, U64, U64, U64 )
+OPCODE(FPMulX32, U32, U32, U32 )
+OPCODE(FPMulX64, U64, U64, U64 )
+OPCODE(FPNeg16, U16, U16 )
+OPCODE(FPNeg32, U32, U32 )
+OPCODE(FPNeg64, U64, U64 )
+OPCODE(FPRecipEstimate16, U16, U16 )
+OPCODE(FPRecipEstimate32, U32, U32 )
+OPCODE(FPRecipEstimate64, U64, U64 )
+OPCODE(FPRecipExponent16, U16, U16 )
+OPCODE(FPRecipExponent32, U32, U32 )
+OPCODE(FPRecipExponent64, U64, U64 )
+OPCODE(FPRecipStepFused16, U16, U16, U16 )
+OPCODE(FPRecipStepFused32, U32, U32, U32 )
+OPCODE(FPRecipStepFused64, U64, U64, U64 )
+OPCODE(FPRoundInt16, U16, U16, U8, U1 )
+OPCODE(FPRoundInt32, U32, U32, U8, U1 )
+OPCODE(FPRoundInt64, U64, U64, U8, U1 )
+OPCODE(FPRSqrtEstimate16, U16, U16 )
+OPCODE(FPRSqrtEstimate32, U32, U32 )
+OPCODE(FPRSqrtEstimate64, U64, U64 )
+OPCODE(FPRSqrtStepFused16, U16, U16, U16 )
+OPCODE(FPRSqrtStepFused32, U32, U32, U32 )
+OPCODE(FPRSqrtStepFused64, U64, U64, U64 )
+OPCODE(FPSqrt32, U32, U32 )
+OPCODE(FPSqrt64, U64, U64 )
+OPCODE(FPSub32, U32, U32, U32 )
+OPCODE(FPSub64, U64, U64, U64 )
+
+// Floating-point conversions
+OPCODE(FPHalfToDouble, U64, U16, U8 )
+OPCODE(FPHalfToSingle, U32, U16, U8 )
+OPCODE(FPSingleToDouble, U64, U32, U8 )
+OPCODE(FPSingleToHalf, U16, U32, U8 )
+OPCODE(FPDoubleToHalf, U16, U64, U8 )
+OPCODE(FPDoubleToSingle, U32, U64, U8 )
+OPCODE(FPDoubleToFixedS16, U16, U64, U8, U8 )
+OPCODE(FPDoubleToFixedS32, U32, U64, U8, U8 )
+OPCODE(FPDoubleToFixedS64, U64, U64, U8, U8 )
+OPCODE(FPDoubleToFixedU16, U16, U64, U8, U8 )
+OPCODE(FPDoubleToFixedU32, U32, U64, U8, U8 )
+OPCODE(FPDoubleToFixedU64, U64, U64, U8, U8 )
+OPCODE(FPHalfToFixedS16, U16, U16, U8, U8 )
+OPCODE(FPHalfToFixedS32, U32, U16, U8, U8 )
+OPCODE(FPHalfToFixedS64, U64, U16, U8, U8 )
+OPCODE(FPHalfToFixedU16, U16, U16, U8, U8 )
+OPCODE(FPHalfToFixedU32, U32, U16, U8, U8 )
+OPCODE(FPHalfToFixedU64, U64, U16, U8, U8 )
+OPCODE(FPSingleToFixedS16, U16, U32, U8, U8 )
+OPCODE(FPSingleToFixedS32, U32, U32, U8, U8 )
+OPCODE(FPSingleToFixedS64, U64, U32, U8, U8 )
+OPCODE(FPSingleToFixedU16, U16, U32, U8, U8 )
+OPCODE(FPSingleToFixedU32, U32, U32, U8, U8 )
+OPCODE(FPSingleToFixedU64, U64, U32, U8, U8 )
+OPCODE(FPFixedU16ToSingle, U32, U16, U8, U8 )
+OPCODE(FPFixedS16ToSingle, U32, U16, U8, U8 )
+OPCODE(FPFixedU16ToDouble, U64, U16, U8, U8 )
+OPCODE(FPFixedS16ToDouble, U64, U16, U8, U8 )
+OPCODE(FPFixedU32ToSingle, U32, U32, U8, U8 )
+OPCODE(FPFixedS32ToSingle, U32, U32, U8, U8 )
+OPCODE(FPFixedU32ToDouble, U64, U32, U8, U8 )
+OPCODE(FPFixedS32ToDouble, U64, U32, U8, U8 )
+OPCODE(FPFixedU64ToDouble, U64, U64, U8, U8 )
+OPCODE(FPFixedU64ToSingle, U32, U64, U8, U8 )
+OPCODE(FPFixedS64ToDouble, U64, U64, U8, U8 )
+OPCODE(FPFixedS64ToSingle, U32, U64, U8, U8 )
+
+// Floating-point vector instructions
+OPCODE(FPVectorAbs16, U128, U128 )
+OPCODE(FPVectorAbs32, U128, U128 )
+OPCODE(FPVectorAbs64, U128, U128 )
+OPCODE(FPVectorAdd32, U128, U128, U128, U1 )
+OPCODE(FPVectorAdd64, U128, U128, U128, U1 )
+OPCODE(FPVectorDiv32, U128, U128, U128, U1 )
+OPCODE(FPVectorDiv64, U128, U128, U128, U1 )
+OPCODE(FPVectorEqual16, U128, U128, U128, U1 )
+OPCODE(FPVectorEqual32, U128, U128, U128, U1 )
+OPCODE(FPVectorEqual64, U128, U128, U128, U1 )
+OPCODE(FPVectorFromHalf32, U128, U128, U8, U1 )
+OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorFromUnsignedFixed64, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorGreater32, U128, U128, U128, U1 )
+OPCODE(FPVectorGreater64, U128, U128, U128, U1 )
+OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 )
+OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
+OPCODE(FPVectorMax32, U128, U128, U128, U1 )
+OPCODE(FPVectorMax64, U128, U128, U128, U1 )
+OPCODE(FPVectorMaxNumeric32, U128, U128, U128, U1 )
+OPCODE(FPVectorMaxNumeric64, U128, U128, U128, U1 )
+OPCODE(FPVectorMin32, U128, U128, U128, U1 )
+OPCODE(FPVectorMin64, U128, U128, U128, U1 )
+OPCODE(FPVectorMinNumeric32, U128, U128, U128, U1 )
+OPCODE(FPVectorMinNumeric64, U128, U128, U128, U1 )
+OPCODE(FPVectorMul32, U128, U128, U128, U1 )
+OPCODE(FPVectorMul64, U128, U128, U128, U1 )
+OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 )
+OPCODE(FPVectorMulAdd32, U128, U128, U128, U128, U1 )
+OPCODE(FPVectorMulAdd64, U128, U128, U128, U128, U1 )
+OPCODE(FPVectorMulX32, U128, U128, U128, U1 )
+OPCODE(FPVectorMulX64, U128, U128, U128, U1 )
+OPCODE(FPVectorNeg16, U128, U128 )
+OPCODE(FPVectorNeg32, U128, U128 )
+OPCODE(FPVectorNeg64, U128, U128 )
+OPCODE(FPVectorPairedAdd32, U128, U128, U128, U1 )
+OPCODE(FPVectorPairedAdd64, U128, U128, U128, U1 )
+OPCODE(FPVectorPairedAddLower32, U128, U128, U128, U1 )
+OPCODE(FPVectorPairedAddLower64, U128, U128, U128, U1 )
+OPCODE(FPVectorRecipEstimate16, U128, U128, U1 )
+OPCODE(FPVectorRecipEstimate32, U128, U128, U1 )
+OPCODE(FPVectorRecipEstimate64, U128, U128, U1 )
+OPCODE(FPVectorRecipStepFused16, U128, U128, U128, U1 )
+OPCODE(FPVectorRecipStepFused32, U128, U128, U128, U1 )
+OPCODE(FPVectorRecipStepFused64, U128, U128, U128, U1 )
+OPCODE(FPVectorRoundInt16, U128, U128, U8, U1, U1 )
+OPCODE(FPVectorRoundInt32, U128, U128, U8, U1, U1 )
+OPCODE(FPVectorRoundInt64, U128, U128, U8, U1, U1 )
+OPCODE(FPVectorRSqrtEstimate16, U128, U128, U1 )
+OPCODE(FPVectorRSqrtEstimate32, U128, U128, U1 )
+OPCODE(FPVectorRSqrtEstimate64, U128, U128, U1 )
+OPCODE(FPVectorRSqrtStepFused16, U128, U128, U128, U1 )
+OPCODE(FPVectorRSqrtStepFused32, U128, U128, U128, U1 )
+OPCODE(FPVectorRSqrtStepFused64, U128, U128, U128, U1 )
+OPCODE(FPVectorSqrt32, U128, U128, U1 )
+OPCODE(FPVectorSqrt64, U128, U128, U1 )
+OPCODE(FPVectorSub32, U128, U128, U128, U1 )
+OPCODE(FPVectorSub64, U128, U128, U128, U1 )
+OPCODE(FPVectorToHalf32, U128, U128, U8, U1 )
+OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorToUnsignedFixed16, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8, U1 )
+OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8, U1 )
+
+// A32 Memory access
+A32OPC(ClearExclusive, Void, )
+A32OPC(ReadMemory8, U8, U64, U32, AccType )
+A32OPC(ReadMemory16, U16, U64, U32, AccType )
+A32OPC(ReadMemory32, U32, U64, U32, AccType )
+A32OPC(ReadMemory64, U64, U64, U32, AccType )
+A32OPC(ExclusiveReadMemory8, U8, U64, U32, AccType )
+A32OPC(ExclusiveReadMemory16, U16, U64, U32, AccType )
+A32OPC(ExclusiveReadMemory32, U32, U64, U32, AccType )
+A32OPC(ExclusiveReadMemory64, U64, U64, U32, AccType )
+A32OPC(WriteMemory8, Void, U64, U32, U8, AccType )
+A32OPC(WriteMemory16, Void, U64, U32, U16, AccType )
+A32OPC(WriteMemory32, Void, U64, U32, U32, AccType )
+A32OPC(WriteMemory64, Void, U64, U32, U64, AccType )
+A32OPC(ExclusiveWriteMemory8, U32, U64, U32, U8, AccType )
+A32OPC(ExclusiveWriteMemory16, U32, U64, U32, U16, AccType )
+A32OPC(ExclusiveWriteMemory32, U32, U64, U32, U32, AccType )
+A32OPC(ExclusiveWriteMemory64, U32, U64, U32, U64, AccType )
+
+// A64 Memory access
+A64OPC(ClearExclusive, Void, )
+A64OPC(ReadMemory8, U8, U64, U64, AccType )
+A64OPC(ReadMemory16, U16, U64, U64, AccType )
+A64OPC(ReadMemory32, U32, U64, U64, AccType )
+A64OPC(ReadMemory64, U64, U64, U64, AccType )
+A64OPC(ReadMemory128, U128, U64, U64, AccType )
+A64OPC(ExclusiveReadMemory8, U8, U64, U64, AccType )
+A64OPC(ExclusiveReadMemory16, U16, U64, U64, AccType )
+A64OPC(ExclusiveReadMemory32, U32, U64, U64, AccType )
+A64OPC(ExclusiveReadMemory64, U64, U64, U64, AccType )
+A64OPC(ExclusiveReadMemory128, U128, U64, U64, AccType )
+A64OPC(WriteMemory8, Void, U64, U64, U8, AccType )
+A64OPC(WriteMemory16, Void, U64, U64, U16, AccType )
+A64OPC(WriteMemory32, Void, U64, U64, U32, AccType )
+A64OPC(WriteMemory64, Void, U64, U64, U64, AccType )
+A64OPC(WriteMemory128, Void, U64, U64, U128, AccType )
+A64OPC(ExclusiveWriteMemory8, U32, U64, U64, U8, AccType )
+A64OPC(ExclusiveWriteMemory16, U32, U64, U64, U16, AccType )
+A64OPC(ExclusiveWriteMemory32, U32, U64, U64, U32, AccType )
+A64OPC(ExclusiveWriteMemory64, U32, U64, U64, U64, AccType )
+A64OPC(ExclusiveWriteMemory128, U32, U64, U64, U128, AccType )
+
+// Coprocessor
+A32OPC(CoprocInternalOperation, Void, CoprocInfo )
+A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 )
+A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 )
+A32OPC(CoprocGetOneWord, U32, CoprocInfo )
+A32OPC(CoprocGetTwoWords, U64, CoprocInfo )
+A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 )
+A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 )
+
+// clang-format on
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp
new file mode 100644
index 0000000000..9699f18345
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp
@@ -0,0 +1,70 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/interface/A32/config.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb) {
+ for (auto& inst : block) {
+ switch (inst.GetOpcode()) {
+ case IR::Opcode::A32ReadMemory8: {
+ if (!inst.AreAllArgsImmediates()) {
+ break;
+ }
+
+ const u32 vaddr = inst.GetArg(1).GetU32();
+ if (cb->IsReadOnlyMemory(vaddr)) {
+ const u8 value_from_memory = cb->MemoryRead8(vaddr);
+ inst.ReplaceUsesWith(IR::Value{value_from_memory});
+ }
+ break;
+ }
+ case IR::Opcode::A32ReadMemory16: {
+ if (!inst.AreAllArgsImmediates()) {
+ break;
+ }
+
+ const u32 vaddr = inst.GetArg(1).GetU32();
+ if (cb->IsReadOnlyMemory(vaddr)) {
+ const u16 value_from_memory = cb->MemoryRead16(vaddr);
+ inst.ReplaceUsesWith(IR::Value{value_from_memory});
+ }
+ break;
+ }
+ case IR::Opcode::A32ReadMemory32: {
+ if (!inst.AreAllArgsImmediates()) {
+ break;
+ }
+
+ const u32 vaddr = inst.GetArg(1).GetU32();
+ if (cb->IsReadOnlyMemory(vaddr)) {
+ const u32 value_from_memory = cb->MemoryRead32(vaddr);
+ inst.ReplaceUsesWith(IR::Value{value_from_memory});
+ }
+ break;
+ }
+ case IR::Opcode::A32ReadMemory64: {
+ if (!inst.AreAllArgsImmediates()) {
+ break;
+ }
+
+ const u32 vaddr = inst.GetArg(1).GetU32();
+ if (cb->IsReadOnlyMemory(vaddr)) {
+ const u64 value_from_memory = cb->MemoryRead64(vaddr);
+ inst.ReplaceUsesWith(IR::Value{value_from_memory});
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+} // namespace Dynarmic::Optimization
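
A32ConstantMemoryReads folds a load whose address argument is an immediate, and whose target IsReadOnlyMemory reports as read-only, into the value read at translation time, so later passes see a constant. The toy sketch below illustrates the same folding idea; ToyInst and the two function-pointer callbacks are hypothetical stand-ins that mirror the roles of UserCallbacks::IsReadOnlyMemory and UserCallbacks::MemoryRead8 in the pass above, not dynarmic's interfaces.

#include <cstdint>
#include <optional>
#include <vector>

struct ToyInst {
    bool is_load8 = false;
    std::optional<std::uint32_t> const_addr;   // set when the address argument is an immediate
    std::optional<std::uint8_t> folded_value;  // set once the load has been replaced by a constant
};

void FoldConstantLoads(std::vector<ToyInst>& block,
                       bool (*is_read_only)(std::uint32_t),
                       std::uint8_t (*read8)(std::uint32_t)) {
    for (auto& inst : block) {
        if (!inst.is_load8 || !inst.const_addr) {
            continue;  // only loads with an immediate address are candidates
        }
        if (is_read_only(*inst.const_addr)) {
            // The memory cannot change after translation, so the load is a constant.
            inst.folded_value = read8(*inst.const_addr);
        }
    }
}
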
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp
new file mode 100644
index 0000000000..4b743bff0b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp
@@ -0,0 +1,377 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <algorithm>
+#include <array>
+#include <functional>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/a32_ir_emitter.h"
+#include "dynarmic/frontend/A32/a32_types.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::Optimization {
+
+namespace {
+
+void FlagsPass(IR::Block& block) {
+ using Iterator = std::reverse_iterator<IR::Block::iterator>;
+
+ struct FlagInfo {
+ bool set_not_required = false;
+ bool has_value_request = false;
+ Iterator value_request = {};
+ };
+ struct ValuelessFlagInfo {
+ bool set_not_required = false;
+ };
+ ValuelessFlagInfo nzcvq;
+ ValuelessFlagInfo nzcv;
+ ValuelessFlagInfo nz;
+ FlagInfo c_flag;
+ FlagInfo ge;
+
+ auto do_set = [&](FlagInfo& info, IR::Value value, Iterator inst) {
+ if (info.has_value_request) {
+ info.value_request->ReplaceUsesWith(value);
+ }
+ info.has_value_request = false;
+
+ if (info.set_not_required) {
+ inst->Invalidate();
+ }
+ info.set_not_required = true;
+ };
+
+ auto do_set_valueless = [&](ValuelessFlagInfo& info, Iterator inst) {
+ if (info.set_not_required) {
+ inst->Invalidate();
+ }
+ info.set_not_required = true;
+ };
+
+ auto do_get = [](FlagInfo& info, Iterator inst) {
+ if (info.has_value_request) {
+ info.value_request->ReplaceUsesWith(IR::Value{&*inst});
+ }
+ info.has_value_request = true;
+ info.value_request = inst;
+ };
+
+ A32::IREmitter ir{block, A32::LocationDescriptor{block.Location()}, {}};
+
+ for (auto inst = block.rbegin(); inst != block.rend(); ++inst) {
+ switch (inst->GetOpcode()) {
+ case IR::Opcode::A32GetCFlag: {
+ do_get(c_flag, inst);
+ break;
+ }
+ case IR::Opcode::A32SetCpsrNZCV: {
+ if (c_flag.has_value_request) {
+ ir.SetInsertionPointBefore(inst.base()); // base is one ahead
+ IR::U1 c = ir.GetCFlagFromNZCV(IR::NZCV{inst->GetArg(0)});
+ c_flag.value_request->ReplaceUsesWith(c);
+ c_flag.has_value_request = false;
+ break; // This case will be executed again because of the above
+ }
+
+ do_set_valueless(nzcv, inst);
+
+ nz = {.set_not_required = true};
+ c_flag = {.set_not_required = true};
+ break;
+ }
+ case IR::Opcode::A32SetCpsrNZCVRaw: {
+ if (c_flag.has_value_request) {
+ nzcv.set_not_required = false;
+ }
+
+ do_set_valueless(nzcv, inst);
+
+ nzcvq = {};
+ nz = {.set_not_required = true};
+ c_flag = {.set_not_required = true};
+ break;
+ }
+ case IR::Opcode::A32SetCpsrNZCVQ: {
+ if (c_flag.has_value_request) {
+ nzcvq.set_not_required = false;
+ }
+
+ do_set_valueless(nzcvq, inst);
+
+ nzcv = {.set_not_required = true};
+ nz = {.set_not_required = true};
+ c_flag = {.set_not_required = true};
+ break;
+ }
+ case IR::Opcode::A32SetCpsrNZ: {
+ do_set_valueless(nz, inst);
+
+ nzcvq = {};
+ nzcv = {};
+ break;
+ }
+ case IR::Opcode::A32SetCpsrNZC: {
+ if (c_flag.has_value_request) {
+ c_flag.value_request->ReplaceUsesWith(inst->GetArg(1));
+ c_flag.has_value_request = false;
+ }
+
+ if (!inst->GetArg(1).IsImmediate() && inst->GetArg(1).GetInstRecursive()->GetOpcode() == IR::Opcode::A32GetCFlag) {
+ const auto nz_value = inst->GetArg(0);
+
+ inst->Invalidate();
+
+ ir.SetInsertionPointBefore(inst.base());
+ ir.SetCpsrNZ(IR::NZCV{nz_value});
+
+ nzcvq = {};
+ nzcv = {};
+ nz = {.set_not_required = true};
+ break;
+ }
+
+ if (nz.set_not_required && c_flag.set_not_required) {
+ inst->Invalidate();
+ } else if (nz.set_not_required) {
+ inst->SetArg(0, IR::Value::EmptyNZCVImmediateMarker());
+ }
+ nz.set_not_required = true;
+ c_flag.set_not_required = true;
+
+ nzcv = {};
+ nzcvq = {};
+ break;
+ }
+ case IR::Opcode::A32SetGEFlags: {
+ do_set(ge, inst->GetArg(0), inst);
+ break;
+ }
+ case IR::Opcode::A32GetGEFlags: {
+ do_get(ge, inst);
+ break;
+ }
+ case IR::Opcode::A32SetGEFlagsCompressed: {
+ ge = {.set_not_required = true};
+ break;
+ }
+ case IR::Opcode::A32OrQFlag: {
+ break;
+ }
+ default: {
+ if (inst->ReadsFromCPSR() || inst->WritesToCPSR()) {
+ nzcvq = {};
+ nzcv = {};
+ nz = {};
+ c_flag = {};
+ ge = {};
+ }
+ break;
+ }
+ }
+ }
+}
+
+void RegisterPass(IR::Block& block) {
+ using Iterator = IR::Block::iterator;
+
+ struct RegInfo {
+ IR::Value register_value;
+ std::optional<Iterator> last_set_instruction;
+ };
+ std::array<RegInfo, 15> reg_info;
+
+ const auto do_get = [](RegInfo& info, Iterator get_inst) {
+ if (info.register_value.IsEmpty()) {
+ info.register_value = IR::Value(&*get_inst);
+ return;
+ }
+ get_inst->ReplaceUsesWith(info.register_value);
+ };
+
+ const auto do_set = [](RegInfo& info, IR::Value value, Iterator set_inst) {
+ if (info.last_set_instruction) {
+ (*info.last_set_instruction)->Invalidate();
+ }
+ info = {
+ .register_value = value,
+ .last_set_instruction = set_inst,
+ };
+ };
+
+ enum class ExtValueType {
+ Empty,
+ Single,
+ Double,
+ VectorDouble,
+ VectorQuad,
+ };
+ struct ExtRegInfo {
+ ExtValueType value_type = {};
+ IR::Value register_value;
+ std::optional<Iterator> last_set_instruction;
+ };
+ std::array<ExtRegInfo, 64> ext_reg_info;
+
+ const auto do_ext_get = [](ExtValueType type, std::initializer_list<std::reference_wrapper<ExtRegInfo>> infos, Iterator get_inst) {
+ if (!std::all_of(infos.begin(), infos.end(), [type](const auto& info) { return info.get().value_type == type; })) {
+ for (auto& info : infos) {
+ info.get() = {
+ .value_type = type,
+ .register_value = IR::Value(&*get_inst),
+ .last_set_instruction = std::nullopt,
+ };
+ }
+ return;
+ }
+ get_inst->ReplaceUsesWith(std::data(infos)[0].get().register_value);
+ };
+
+ const auto do_ext_set = [](ExtValueType type, std::initializer_list<std::reference_wrapper<ExtRegInfo>> infos, IR::Value value, Iterator set_inst) {
+ if (std::all_of(infos.begin(), infos.end(), [type](const auto& info) { return info.get().value_type == type; })) {
+ if (std::data(infos)[0].get().last_set_instruction) {
+ (*std::data(infos)[0].get().last_set_instruction)->Invalidate();
+ }
+ }
+ for (auto& info : infos) {
+ info.get() = {
+ .value_type = type,
+ .register_value = value,
+ .last_set_instruction = set_inst,
+ };
+ }
+ };
+
+ // Location and version don't matter here.
+ A32::IREmitter ir{block, A32::LocationDescriptor{block.Location()}, {}};
+
+ for (auto inst = block.begin(); inst != block.end(); ++inst) {
+ switch (inst->GetOpcode()) {
+ case IR::Opcode::A32GetRegister: {
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+ ASSERT(reg != A32::Reg::PC);
+ const size_t reg_index = static_cast<size_t>(reg);
+ do_get(reg_info[reg_index], inst);
+ break;
+ }
+ case IR::Opcode::A32SetRegister: {
+ const A32::Reg reg = inst->GetArg(0).GetA32RegRef();
+ if (reg == A32::Reg::PC) {
+ break;
+ }
+ const auto reg_index = static_cast<size_t>(reg);
+ do_set(reg_info[reg_index], inst->GetArg(1), inst);
+ break;
+ }
+ case IR::Opcode::A32GetExtendedRegister32: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ do_ext_get(ExtValueType::Single, {ext_reg_info[reg_index]}, inst);
+ break;
+ }
+ case IR::Opcode::A32SetExtendedRegister32: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ do_ext_set(ExtValueType::Single, {ext_reg_info[reg_index]}, inst->GetArg(1), inst);
+ break;
+ }
+ case IR::Opcode::A32GetExtendedRegister64: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ do_ext_get(ExtValueType::Double,
+ {
+ ext_reg_info[reg_index * 2 + 0],
+ ext_reg_info[reg_index * 2 + 1],
+ },
+ inst);
+ break;
+ }
+ case IR::Opcode::A32SetExtendedRegister64: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ do_ext_set(ExtValueType::Double,
+ {
+ ext_reg_info[reg_index * 2 + 0],
+ ext_reg_info[reg_index * 2 + 1],
+ },
+ inst->GetArg(1),
+ inst);
+ break;
+ }
+ case IR::Opcode::A32GetVector: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ if (A32::IsDoubleExtReg(reg)) {
+ do_ext_get(ExtValueType::VectorDouble,
+ {
+ ext_reg_info[reg_index * 2 + 0],
+ ext_reg_info[reg_index * 2 + 1],
+ },
+ inst);
+ } else {
+ DEBUG_ASSERT(A32::IsQuadExtReg(reg));
+ do_ext_get(ExtValueType::VectorQuad,
+ {
+ ext_reg_info[reg_index * 4 + 0],
+ ext_reg_info[reg_index * 4 + 1],
+ ext_reg_info[reg_index * 4 + 2],
+ ext_reg_info[reg_index * 4 + 3],
+ },
+ inst);
+ }
+ break;
+ }
+ case IR::Opcode::A32SetVector: {
+ const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef();
+ const size_t reg_index = A32::RegNumber(reg);
+ if (A32::IsDoubleExtReg(reg)) {
+ ir.SetInsertionPointAfter(inst);
+ const IR::U128 stored_value = ir.VectorZeroUpper(IR::U128{inst->GetArg(1)});
+ do_ext_set(ExtValueType::VectorDouble,
+ {
+ ext_reg_info[reg_index * 2 + 0],
+ ext_reg_info[reg_index * 2 + 1],
+ },
+ stored_value,
+ inst);
+ } else {
+ DEBUG_ASSERT(A32::IsQuadExtReg(reg));
+ do_ext_set(ExtValueType::VectorQuad,
+ {
+ ext_reg_info[reg_index * 4 + 0],
+ ext_reg_info[reg_index * 4 + 1],
+ ext_reg_info[reg_index * 4 + 2],
+ ext_reg_info[reg_index * 4 + 3],
+ },
+ inst->GetArg(1),
+ inst);
+ }
+ break;
+ }
+ default: {
+ if (inst->ReadsFromCoreRegister() || inst->WritesToCoreRegister()) {
+ reg_info = {};
+ ext_reg_info = {};
+ }
+ break;
+ }
+ }
+ }
+}
+
+} // namespace
+
+void A32GetSetElimination(IR::Block& block, A32GetSetEliminationOptions) {
+ FlagsPass(block);
+ RegisterPass(block);
+}
+
+} // namespace Dynarmic::Optimization
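
RegisterPass above is store-to-load forwarding within a block: a Get that follows a Set of the same register is replaced with the value that was stored, and a Set that is superseded by a later Set (all intervening Gets having been forwarded) is invalidated; any instruction that may otherwise touch core registers resets the tracking. FlagsPass applies the analogous scheme to the flag writes, walking the block in reverse. A toy, self-contained sketch of the register idea is below; ToyOp and EliminateGetSet are illustrative only, not dynarmic's API.

#include <array>
#include <cstdio>
#include <optional>

struct ToyOp {
    enum Kind { Get, Set, Other } kind;
    int reg = 0;
    int value = 0;      // for Set: the value written; for Get: filled in when forwarded
    bool dead = false;  // marks a Set that a later Set made redundant
};

void EliminateGetSet(std::array<ToyOp, 4>& ops) {
    std::array<std::optional<int>, 16> known;  // last known value per register
    std::array<ToyOp*, 16> last_set{};         // most recent Set per register, if still removable
    for (auto& op : ops) {
        switch (op.kind) {
        case ToyOp::Get:
            if (known[op.reg]) {
                op.value = *known[op.reg];  // forward the stored value; this Get no longer reads the register
            }
            break;
        case ToyOp::Set:
            if (last_set[op.reg]) {
                last_set[op.reg]->dead = true;  // overwritten before leaving the block
            }
            known[op.reg] = op.value;
            last_set[op.reg] = &op;
            break;
        case ToyOp::Other:
            known = {};  // unknown register side effects: forget everything
            last_set = {};
            break;
        }
    }
}

int main() {
    std::array<ToyOp, 4> ops{{{ToyOp::Set, 0, 1}, {ToyOp::Get, 0}, {ToyOp::Set, 0, 2}, {ToyOp::Get, 0}}};
    EliminateGetSet(ops);
    std::printf("first set dead=%d, gets see %d and %d\n", ops[0].dead, ops[1].value, ops[3].value);  // 1, 1, 2
}
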
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp
new file mode 100644
index 0000000000..79d9769520
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp
@@ -0,0 +1,57 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/frontend/A64/a64_ir_emitter.h"
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf) {
+ if (conf.hook_data_cache_operations) {
+ return;
+ }
+
+ for (auto& inst : block) {
+ if (inst.GetOpcode() != IR::Opcode::A64DataCacheOperationRaised) {
+ continue;
+ }
+
+ const auto op = static_cast<A64::DataCacheOperation>(inst.GetArg(1).GetU64());
+ if (op == A64::DataCacheOperation::ZeroByVA) {
+ A64::IREmitter ir{block};
+ ir.current_location = A64::LocationDescriptor{IR::LocationDescriptor{inst.GetArg(0).GetU64()}};
+ ir.SetInsertionPointBefore(&inst);
+
+ size_t bytes = 4 << static_cast<size_t>(conf.dczid_el0 & 0b1111);
+ IR::U64 addr{inst.GetArg(2)};
+
+ const IR::U128 zero_u128 = ir.ZeroExtendToQuad(ir.Imm64(0));
+ while (bytes >= 16) {
+ ir.WriteMemory128(addr, zero_u128, IR::AccType::DCZVA);
+ addr = ir.Add(addr, ir.Imm64(16));
+ bytes -= 16;
+ }
+
+ while (bytes >= 8) {
+ ir.WriteMemory64(addr, ir.Imm64(0), IR::AccType::DCZVA);
+ addr = ir.Add(addr, ir.Imm64(8));
+ bytes -= 8;
+ }
+
+ while (bytes >= 4) {
+ ir.WriteMemory32(addr, ir.Imm32(0), IR::AccType::DCZVA);
+ addr = ir.Add(addr, ir.Imm64(4));
+ bytes -= 4;
+ }
+ }
+ inst.Invalidate();
+ }
+}
+
+} // namespace Dynarmic::Optimization
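
When data-cache hooks are not requested, the pass above expands DC ZVA inline. The block size comes from the low four bits of the configured DCZID_EL0 value, which the architecture defines as log2 of the zeroing block size in 4-byte words, hence bytes = 4 << (dczid_el0 & 0b1111). A small worked computation, using an assumed example value for the BS field:

#include <cstddef>
#include <cstdio>

int main() {
    const unsigned dczid_el0 = 0b0100;  // assumed example: BS field = 4
    const std::size_t bytes = std::size_t{4} << (dczid_el0 & 0b1111);
    std::printf("DC ZVA zeroes %zu bytes per operation\n", bytes);  // 4 << 4 = 64
}
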
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp
new file mode 100644
index 0000000000..f49201562a
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp
@@ -0,0 +1,161 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A64/a64_types.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::Optimization {
+
+void A64GetSetElimination(IR::Block& block) {
+ using Iterator = IR::Block::iterator;
+
+ enum class TrackingType {
+ W,
+ X,
+ S,
+ D,
+ Q,
+ SP,
+ NZCV,
+ NZCVRaw,
+ };
+ struct RegisterInfo {
+ IR::Value register_value;
+ TrackingType tracking_type;
+ bool set_instruction_present = false;
+ Iterator last_set_instruction;
+ };
+ std::array<RegisterInfo, 31> reg_info;
+ std::array<RegisterInfo, 32> vec_info;
+ RegisterInfo sp_info;
+ RegisterInfo nzcv_info;
+
+ const auto do_set = [&block](RegisterInfo& info, IR::Value value, Iterator set_inst, TrackingType tracking_type) {
+ if (info.set_instruction_present) {
+ info.last_set_instruction->Invalidate();
+ block.Instructions().erase(info.last_set_instruction);
+ }
+
+ info.register_value = value;
+ info.tracking_type = tracking_type;
+ info.set_instruction_present = true;
+ info.last_set_instruction = set_inst;
+ };
+
+ const auto do_get = [](RegisterInfo& info, Iterator get_inst, TrackingType tracking_type) {
+ const auto do_nothing = [&] {
+ info = {};
+ info.register_value = IR::Value(&*get_inst);
+ info.tracking_type = tracking_type;
+ };
+
+ if (info.register_value.IsEmpty()) {
+ do_nothing();
+ return;
+ }
+
+ if (info.tracking_type == tracking_type) {
+ get_inst->ReplaceUsesWith(info.register_value);
+ return;
+ }
+
+ do_nothing();
+ };
+
+ for (auto inst = block.begin(); inst != block.end(); ++inst) {
+ switch (inst->GetOpcode()) {
+ case IR::Opcode::A64GetW: {
+ const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef());
+ do_get(reg_info.at(index), inst, TrackingType::W);
+ break;
+ }
+ case IR::Opcode::A64GetX: {
+ const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef());
+ do_get(reg_info.at(index), inst, TrackingType::X);
+ break;
+ }
+ case IR::Opcode::A64GetS: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_get(vec_info.at(index), inst, TrackingType::S);
+ break;
+ }
+ case IR::Opcode::A64GetD: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_get(vec_info.at(index), inst, TrackingType::D);
+ break;
+ }
+ case IR::Opcode::A64GetQ: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_get(vec_info.at(index), inst, TrackingType::Q);
+ break;
+ }
+ case IR::Opcode::A64GetSP: {
+ do_get(sp_info, inst, TrackingType::SP);
+ break;
+ }
+ case IR::Opcode::A64GetNZCVRaw: {
+ do_get(nzcv_info, inst, TrackingType::NZCVRaw);
+ break;
+ }
+ case IR::Opcode::A64SetW: {
+ const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef());
+ do_set(reg_info.at(index), inst->GetArg(1), inst, TrackingType::W);
+ break;
+ }
+ case IR::Opcode::A64SetX: {
+ const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef());
+ do_set(reg_info.at(index), inst->GetArg(1), inst, TrackingType::X);
+ break;
+ }
+ case IR::Opcode::A64SetS: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::S);
+ break;
+ }
+ case IR::Opcode::A64SetD: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::D);
+ break;
+ }
+ case IR::Opcode::A64SetQ: {
+ const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef());
+ do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::Q);
+ break;
+ }
+ case IR::Opcode::A64SetSP: {
+ do_set(sp_info, inst->GetArg(0), inst, TrackingType::SP);
+ break;
+ }
+ case IR::Opcode::A64SetNZCV: {
+ do_set(nzcv_info, inst->GetArg(0), inst, TrackingType::NZCV);
+ break;
+ }
+ case IR::Opcode::A64SetNZCVRaw: {
+ do_set(nzcv_info, inst->GetArg(0), inst, TrackingType::NZCVRaw);
+ break;
+ }
+ default: {
+ if (inst->ReadsFromCPSR() || inst->WritesToCPSR()) {
+ nzcv_info = {};
+ }
+ if (inst->ReadsFromCoreRegister() || inst->WritesToCoreRegister()) {
+ reg_info = {};
+ vec_info = {};
+ sp_info = {};
+ }
+ break;
+ }
+ }
+ }
+}
+
+} // namespace Dynarmic::Optimization
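
To make the bookkeeping above concrete, here is a standalone toy sketch (ToyInst and its fields are hypothetical and deliberately much simpler than dynarmic's IR): a later write to the same register marks the earlier write dead, and a read after a write of the same width is forwarded the written value.

#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

struct ToyInst {
    enum class Kind { SetX, GetX, Other } kind;
    int reg = 0;
    int value = 0;       // value written by SetX
    bool dead = false;   // set when a later SetX makes this store redundant
    int forwarded = -1;  // for GetX: value forwarded from the tracked SetX
};

void ToyGetSetElimination(std::vector<ToyInst>& block) {
    struct Info { std::optional<int> value; std::optional<std::size_t> last_set; };
    Info reg_info[31] = {};

    for (std::size_t i = 0; i < block.size(); ++i) {
        ToyInst& inst = block[i];
        Info& info = reg_info[inst.reg];
        switch (inst.kind) {
        case ToyInst::Kind::SetX:
            if (info.last_set)  // previous store was never observed: dead store
                block[*info.last_set].dead = true;
            info = {inst.value, i};
            break;
        case ToyInst::Kind::GetX:
            if (info.value)     // forward the known register value to this read
                inst.forwarded = *info.value;
            // (the real pass also remembers this Get so later reads reuse it)
            break;
        default:                // unknown side effects: conservatively forget everything
            for (Info& r : reg_info)
                r = {};
            break;
        }
    }
}

int main() {
    std::vector<ToyInst> block{ToyInst{ToyInst::Kind::SetX, 0, 1},
                               ToyInst{ToyInst::Kind::SetX, 0, 2},
                               ToyInst{ToyInst::Kind::GetX, 0}};
    ToyGetSetElimination(block);
    std::printf("first set dead: %d, get forwarded: %d\n", block[0].dead, block[2].forwarded);
}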
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp
new file mode 100644
index 0000000000..00a0e1b672
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp
@@ -0,0 +1,54 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2018 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <boost/variant/get.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A64/a64_location_descriptor.h"
+#include "dynarmic/frontend/A64/translate/a64_translate.h"
+#include "dynarmic/interface/A64/config.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+void A64MergeInterpretBlocksPass(IR::Block& block, A64::UserCallbacks* cb) {
+ const auto is_interpret_instruction = [cb](A64::LocationDescriptor location) {
+ const auto instruction = cb->MemoryReadCode(location.PC());
+ if (!instruction)
+ return false;
+
+ IR::Block new_block{location};
+ A64::TranslateSingleInstruction(new_block, location, *instruction);
+
+ if (!new_block.Instructions().empty())
+ return false;
+
+ const IR::Terminal terminal = new_block.GetTerminal();
+ if (auto term = boost::get<IR::Term::Interpret>(&terminal)) {
+ return term->next == location;
+ }
+
+ return false;
+ };
+
+ IR::Terminal terminal = block.GetTerminal();
+ auto term = boost::get<IR::Term::Interpret>(&terminal);
+ if (!term)
+ return;
+
+ A64::LocationDescriptor location{term->next};
+ size_t num_instructions = 1;
+
+ while (is_interpret_instruction(location.AdvancePC(static_cast<int>(num_instructions * 4)))) {
+ num_instructions++;
+ }
+
+ term->num_instructions = num_instructions;
+ block.ReplaceTerminal(terminal);
+ block.CycleCount() += num_instructions - 1;
+}
+
+} // namespace Dynarmic::Optimization
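
As a standalone illustration of the scan above (the helper name and the predicate are hypothetical; in the real pass the predicate re-translates each instruction): starting from the interpreted terminal's target, count how many consecutive 4-byte instructions also resolve to interpret-only blocks.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>

// Count how many consecutive instructions (4 bytes apart), starting at `pc`,
// the predicate classifies as interpret-only. The terminal itself accounts for one.
std::size_t CountInterpretRun(std::uint64_t pc, const std::function<bool(std::uint64_t)>& is_interpret_only) {
    std::size_t num_instructions = 1;
    while (is_interpret_only(pc + num_instructions * 4)) {
        num_instructions++;
    }
    return num_instructions;
}

int main() {
    // Pretend the three instructions following 0x1000 also need the interpreter.
    const std::size_t n = CountInterpretRun(0x1000, [](std::uint64_t pc) { return pc < 0x1010; });
    std::printf("interpret %zu instructions\n", n);  // prints 4
}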
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp
new file mode 100644
index 0000000000..83ed84998d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp
@@ -0,0 +1,556 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <optional>
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/rotate.hpp>
+#include <mcl/bit/swap.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/common/safe_ops.h"
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/ir_emitter.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+using Op = Dynarmic::IR::Opcode;
+
+namespace {
+
+// Tiny helper to avoid having to choose between a 32-bit and a 64-bit store
+// based on the opcode bit size all over the place within the folding functions.
+void ReplaceUsesWith(IR::Inst& inst, bool is_32_bit, u64 value) {
+ if (is_32_bit) {
+ inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
+ } else {
+ inst.ReplaceUsesWith(IR::Value{value});
+ }
+}
+
+IR::Value Value(bool is_32_bit, u64 value) {
+ return is_32_bit ? IR::Value{static_cast<u32>(value)} : IR::Value{value};
+}
+
+template<typename ImmFn>
+bool FoldCommutative(IR::Inst& inst, bool is_32_bit, ImmFn imm_fn) {
+ const auto lhs = inst.GetArg(0);
+ const auto rhs = inst.GetArg(1);
+
+ const bool is_lhs_immediate = lhs.IsImmediate();
+ const bool is_rhs_immediate = rhs.IsImmediate();
+
+ if (is_lhs_immediate && is_rhs_immediate) {
+ const u64 result = imm_fn(lhs.GetImmediateAsU64(), rhs.GetImmediateAsU64());
+ ReplaceUsesWith(inst, is_32_bit, result);
+ return false;
+ }
+
+ if (is_lhs_immediate && !is_rhs_immediate) {
+ const IR::Inst* rhs_inst = rhs.GetInstRecursive();
+ if (rhs_inst->GetOpcode() == inst.GetOpcode() && rhs_inst->GetArg(1).IsImmediate()) {
+ const u64 combined = imm_fn(lhs.GetImmediateAsU64(), rhs_inst->GetArg(1).GetImmediateAsU64());
+ inst.SetArg(0, rhs_inst->GetArg(0));
+ inst.SetArg(1, Value(is_32_bit, combined));
+ } else {
+ // Normalize
+ inst.SetArg(0, rhs);
+ inst.SetArg(1, lhs);
+ }
+ }
+
+ if (!is_lhs_immediate && is_rhs_immediate) {
+ const IR::Inst* lhs_inst = lhs.GetInstRecursive();
+ if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->GetArg(1).IsImmediate()) {
+ const u64 combined = imm_fn(rhs.GetImmediateAsU64(), lhs_inst->GetArg(1).GetImmediateAsU64());
+ inst.SetArg(0, lhs_inst->GetArg(0));
+ inst.SetArg(1, Value(is_32_bit, combined));
+ }
+ }
+
+ return true;
+}
+
+void FoldAdd(IR::Inst& inst, bool is_32_bit) {
+ const auto lhs = inst.GetArg(0);
+ const auto rhs = inst.GetArg(1);
+ const auto carry = inst.GetArg(2);
+
+ if (lhs.IsImmediate() && !rhs.IsImmediate()) {
+ // Normalize
+ inst.SetArg(0, rhs);
+ inst.SetArg(1, lhs);
+ FoldAdd(inst, is_32_bit);
+ return;
+ }
+
+ if (inst.HasAssociatedPseudoOperation()) {
+ return;
+ }
+
+ if (!lhs.IsImmediate() && rhs.IsImmediate()) {
+ const IR::Inst* lhs_inst = lhs.GetInstRecursive();
+ if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->GetArg(1).IsImmediate() && lhs_inst->GetArg(2).IsImmediate()) {
+ const u64 combined = rhs.GetImmediateAsU64() + lhs_inst->GetArg(1).GetImmediateAsU64() + lhs_inst->GetArg(2).GetU1();
+ if (combined == 0) {
+ inst.ReplaceUsesWith(lhs_inst->GetArg(0));
+ return;
+ }
+ inst.SetArg(0, lhs_inst->GetArg(0));
+ inst.SetArg(1, Value(is_32_bit, combined));
+ return;
+ }
+ if (rhs.IsZero() && carry.IsZero()) {
+ inst.ReplaceUsesWith(lhs);
+ return;
+ }
+ }
+
+ if (inst.AreAllArgsImmediates()) {
+ const u64 result = lhs.GetImmediateAsU64() + rhs.GetImmediateAsU64() + carry.GetU1();
+ ReplaceUsesWith(inst, is_32_bit, result);
+ return;
+ }
+}
+
+// Folds AND operations based on the following:
+//
+// 1. imm_x & imm_y -> result
+// 2. x & 0 -> 0
+// 3. 0 & y -> 0
+// 4. x & y -> y (where x has all bits set to 1)
+// 5. x & y -> x (where y has all bits set to 1)
+//
+void FoldAND(IR::Inst& inst, bool is_32_bit) {
+ if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a & b; })) {
+ const auto rhs = inst.GetArg(1);
+ if (rhs.IsZero()) {
+ ReplaceUsesWith(inst, is_32_bit, 0);
+ } else if (rhs.HasAllBitsSet()) {
+ inst.ReplaceUsesWith(inst.GetArg(0));
+ }
+ }
+}
+
+// Folds byte reversal opcodes based on the following:
+//
+// 1. imm -> swap(imm)
+//
+void FoldByteReverse(IR::Inst& inst, Op op) {
+ const auto operand = inst.GetArg(0);
+
+ if (!operand.IsImmediate()) {
+ return;
+ }
+
+ if (op == Op::ByteReverseWord) {
+ const u32 result = mcl::bit::swap_bytes_32(static_cast<u32>(operand.GetImmediateAsU64()));
+ inst.ReplaceUsesWith(IR::Value{result});
+ } else if (op == Op::ByteReverseHalf) {
+ const u16 result = mcl::bit::swap_bytes_16(static_cast<u16>(operand.GetImmediateAsU64()));
+ inst.ReplaceUsesWith(IR::Value{result});
+ } else {
+ const u64 result = mcl::bit::swap_bytes_64(operand.GetImmediateAsU64());
+ inst.ReplaceUsesWith(IR::Value{result});
+ }
+}
+
+// Folds division operations based on the following:
+//
+// 1. x / 0 -> 0 (NOTE: This is an ARM-specific behavior defined in the architecture reference manual)
+// 2. imm_x / imm_y -> result
+// 3. x / 1 -> x
+//
+void FoldDivide(IR::Inst& inst, bool is_32_bit, bool is_signed) {
+ const auto rhs = inst.GetArg(1);
+
+ if (rhs.IsZero()) {
+ ReplaceUsesWith(inst, is_32_bit, 0);
+ return;
+ }
+
+ const auto lhs = inst.GetArg(0);
+ if (lhs.IsImmediate() && rhs.IsImmediate()) {
+ if (is_signed) {
+ const s64 result = lhs.GetImmediateAsS64() / rhs.GetImmediateAsS64();
+ ReplaceUsesWith(inst, is_32_bit, static_cast<u64>(result));
+ } else {
+ const u64 result = lhs.GetImmediateAsU64() / rhs.GetImmediateAsU64();
+ ReplaceUsesWith(inst, is_32_bit, result);
+ }
+ } else if (rhs.IsUnsignedImmediate(1)) {
+ inst.ReplaceUsesWith(IR::Value{lhs});
+ }
+}
+
+// Folds EOR operations based on the following:
+//
+// 1. imm_x ^ imm_y -> result
+// 2. x ^ 0 -> x
+// 3. 0 ^ y -> y
+//
+void FoldEOR(IR::Inst& inst, bool is_32_bit) {
+ if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a ^ b; })) {
+ const auto rhs = inst.GetArg(1);
+ if (rhs.IsZero()) {
+ inst.ReplaceUsesWith(inst.GetArg(0));
+ }
+ }
+}
+
+void FoldLeastSignificantByte(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const auto operand = inst.GetArg(0);
+ inst.ReplaceUsesWith(IR::Value{static_cast<u8>(operand.GetImmediateAsU64())});
+}
+
+void FoldLeastSignificantHalf(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const auto operand = inst.GetArg(0);
+ inst.ReplaceUsesWith(IR::Value{static_cast<u16>(operand.GetImmediateAsU64())});
+}
+
+void FoldLeastSignificantWord(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const auto operand = inst.GetArg(0);
+ inst.ReplaceUsesWith(IR::Value{static_cast<u32>(operand.GetImmediateAsU64())});
+}
+
+void FoldMostSignificantBit(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const auto operand = inst.GetArg(0);
+ inst.ReplaceUsesWith(IR::Value{(operand.GetImmediateAsU64() >> 31) != 0});
+}
+
+void FoldMostSignificantWord(IR::Inst& inst) {
+ IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp);
+
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const auto operand = inst.GetArg(0);
+ if (carry_inst) {
+ carry_inst->ReplaceUsesWith(IR::Value{mcl::bit::get_bit<31>(operand.GetImmediateAsU64())});
+ }
+ inst.ReplaceUsesWith(IR::Value{static_cast<u32>(operand.GetImmediateAsU64() >> 32)});
+}
+
+// Folds multiplication operations based on the following:
+//
+// 1. imm_x * imm_y -> result
+// 2. x * 0 -> 0
+// 3. 0 * y -> 0
+// 4. x * 1 -> x
+// 5. 1 * y -> y
+//
+void FoldMultiply(IR::Inst& inst, bool is_32_bit) {
+ if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a * b; })) {
+ const auto rhs = inst.GetArg(1);
+ if (rhs.IsZero()) {
+ ReplaceUsesWith(inst, is_32_bit, 0);
+ } else if (rhs.IsUnsignedImmediate(1)) {
+ inst.ReplaceUsesWith(inst.GetArg(0));
+ }
+ }
+}
+
+// Folds NOT operations if the contained value is an immediate.
+void FoldNOT(IR::Inst& inst, bool is_32_bit) {
+ const auto operand = inst.GetArg(0);
+
+ if (!operand.IsImmediate()) {
+ return;
+ }
+
+ const u64 result = ~operand.GetImmediateAsU64();
+ ReplaceUsesWith(inst, is_32_bit, result);
+}
+
+// Folds OR operations based on the following:
+//
+// 1. imm_x | imm_y -> result
+// 2. x | 0 -> x
+// 3. 0 | y -> y
+//
+void FoldOR(IR::Inst& inst, bool is_32_bit) {
+ if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a | b; })) {
+ const auto rhs = inst.GetArg(1);
+ if (rhs.IsZero()) {
+ inst.ReplaceUsesWith(inst.GetArg(0));
+ }
+ }
+}
+
+bool FoldShifts(IR::Inst& inst) {
+ IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp);
+
+ // The 32-bit variants can contain 3 arguments, while the
+ // 64-bit variants only contain 2.
+ if (inst.NumArgs() == 3 && !carry_inst) {
+ inst.SetArg(2, IR::Value(false));
+ }
+
+ const auto shift_amount = inst.GetArg(1);
+
+ if (shift_amount.IsZero()) {
+ if (carry_inst) {
+ carry_inst->ReplaceUsesWith(inst.GetArg(2));
+ }
+ inst.ReplaceUsesWith(inst.GetArg(0));
+ return false;
+ }
+
+ if (inst.NumArgs() == 3 && shift_amount.IsImmediate() && !shift_amount.IsZero()) {
+ inst.SetArg(2, IR::Value(false));
+ }
+
+ if (!inst.AreAllArgsImmediates() || carry_inst) {
+ return false;
+ }
+
+ return true;
+}
+
+void FoldSignExtendXToWord(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const s64 value = inst.GetArg(0).GetImmediateAsS64();
+ inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
+}
+
+void FoldSignExtendXToLong(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const s64 value = inst.GetArg(0).GetImmediateAsS64();
+ inst.ReplaceUsesWith(IR::Value{static_cast<u64>(value)});
+}
+
+void FoldSub(IR::Inst& inst, bool is_32_bit) {
+ if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) {
+ return;
+ }
+
+ const auto lhs = inst.GetArg(0);
+ const auto rhs = inst.GetArg(1);
+ const auto carry = inst.GetArg(2);
+
+ const u64 result = lhs.GetImmediateAsU64() + (~rhs.GetImmediateAsU64()) + carry.GetU1();
+ ReplaceUsesWith(inst, is_32_bit, result);
+}
+
+void FoldZeroExtendXToWord(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const u64 value = inst.GetArg(0).GetImmediateAsU64();
+ inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
+}
+
+void FoldZeroExtendXToLong(IR::Inst& inst) {
+ if (!inst.AreAllArgsImmediates()) {
+ return;
+ }
+
+ const u64 value = inst.GetArg(0).GetImmediateAsU64();
+ inst.ReplaceUsesWith(IR::Value{value});
+}
+} // Anonymous namespace
+
+void ConstantPropagation(IR::Block& block) {
+ for (auto& inst : block) {
+ const auto opcode = inst.GetOpcode();
+
+ switch (opcode) {
+ case Op::LeastSignificantWord:
+ FoldLeastSignificantWord(inst);
+ break;
+ case Op::MostSignificantWord:
+ FoldMostSignificantWord(inst);
+ break;
+ case Op::LeastSignificantHalf:
+ FoldLeastSignificantHalf(inst);
+ break;
+ case Op::LeastSignificantByte:
+ FoldLeastSignificantByte(inst);
+ break;
+ case Op::MostSignificantBit:
+ FoldMostSignificantBit(inst);
+ break;
+ case Op::IsZero32:
+ if (inst.AreAllArgsImmediates()) {
+ inst.ReplaceUsesWith(IR::Value{inst.GetArg(0).GetU32() == 0});
+ }
+ break;
+ case Op::IsZero64:
+ if (inst.AreAllArgsImmediates()) {
+ inst.ReplaceUsesWith(IR::Value{inst.GetArg(0).GetU64() == 0});
+ }
+ break;
+ case Op::LogicalShiftLeft32:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, true, Safe::LogicalShiftLeft<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::LogicalShiftLeft64:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, false, Safe::LogicalShiftLeft<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::LogicalShiftRight32:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, true, Safe::LogicalShiftRight<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::LogicalShiftRight64:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, false, Safe::LogicalShiftRight<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::ArithmeticShiftRight32:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, true, Safe::ArithmeticShiftRight<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::ArithmeticShiftRight64:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, false, Safe::ArithmeticShiftRight<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::RotateRight32:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, true, mcl::bit::rotate_right<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::RotateRight64:
+ if (FoldShifts(inst)) {
+ ReplaceUsesWith(inst, false, mcl::bit::rotate_right<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8()));
+ }
+ break;
+ case Op::LogicalShiftLeftMasked32:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, true, inst.GetArg(0).GetU32() << (inst.GetArg(1).GetU32() & 0x1f));
+ }
+ break;
+ case Op::LogicalShiftLeftMasked64:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, false, inst.GetArg(0).GetU64() << (inst.GetArg(1).GetU64() & 0x3f));
+ }
+ break;
+ case Op::LogicalShiftRightMasked32:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, true, inst.GetArg(0).GetU32() >> (inst.GetArg(1).GetU32() & 0x1f));
+ }
+ break;
+ case Op::LogicalShiftRightMasked64:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, false, inst.GetArg(0).GetU64() >> (inst.GetArg(1).GetU64() & 0x3f));
+ }
+ break;
+ case Op::ArithmeticShiftRightMasked32:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, true, static_cast<s32>(inst.GetArg(0).GetU32()) >> (inst.GetArg(1).GetU32() & 0x1f));
+ }
+ break;
+ case Op::ArithmeticShiftRightMasked64:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, false, static_cast<s64>(inst.GetArg(0).GetU64()) >> (inst.GetArg(1).GetU64() & 0x3f));
+ }
+ break;
+ case Op::RotateRightMasked32:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, true, mcl::bit::rotate_right<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU32()));
+ }
+ break;
+ case Op::RotateRightMasked64:
+ if (inst.AreAllArgsImmediates()) {
+ ReplaceUsesWith(inst, false, mcl::bit::rotate_right<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU64()));
+ }
+ break;
+ case Op::Add32:
+ case Op::Add64:
+ FoldAdd(inst, opcode == Op::Add32);
+ break;
+ case Op::Sub32:
+ case Op::Sub64:
+ FoldSub(inst, opcode == Op::Sub32);
+ break;
+ case Op::Mul32:
+ case Op::Mul64:
+ FoldMultiply(inst, opcode == Op::Mul32);
+ break;
+ case Op::SignedDiv32:
+ case Op::SignedDiv64:
+ FoldDivide(inst, opcode == Op::SignedDiv32, true);
+ break;
+ case Op::UnsignedDiv32:
+ case Op::UnsignedDiv64:
+ FoldDivide(inst, opcode == Op::UnsignedDiv32, false);
+ break;
+ case Op::And32:
+ case Op::And64:
+ FoldAND(inst, opcode == Op::And32);
+ break;
+ case Op::Eor32:
+ case Op::Eor64:
+ FoldEOR(inst, opcode == Op::Eor32);
+ break;
+ case Op::Or32:
+ case Op::Or64:
+ FoldOR(inst, opcode == Op::Or32);
+ break;
+ case Op::Not32:
+ case Op::Not64:
+ FoldNOT(inst, opcode == Op::Not32);
+ break;
+ case Op::SignExtendByteToWord:
+ case Op::SignExtendHalfToWord:
+ FoldSignExtendXToWord(inst);
+ break;
+ case Op::SignExtendByteToLong:
+ case Op::SignExtendHalfToLong:
+ case Op::SignExtendWordToLong:
+ FoldSignExtendXToLong(inst);
+ break;
+ case Op::ZeroExtendByteToWord:
+ case Op::ZeroExtendHalfToWord:
+ FoldZeroExtendXToWord(inst);
+ break;
+ case Op::ZeroExtendByteToLong:
+ case Op::ZeroExtendHalfToLong:
+ case Op::ZeroExtendWordToLong:
+ FoldZeroExtendXToLong(inst);
+ break;
+ case Op::ByteReverseWord:
+ case Op::ByteReverseHalf:
+ case Op::ByteReverseDual:
+ FoldByteReverse(inst, opcode);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+} // namespace Dynarmic::Optimization
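
Two of the arithmetic identities the folds above rely on can be checked in isolation. This is a standalone sketch, not dynarmic code: FoldSub's lhs + ~rhs + carry form, and the masking behaviour of the *Masked* shift opcodes.

#include <cassert>
#include <cstdint>

int main() {
    // Subtraction folds as lhs + ~rhs + carry; with carry = 1 this is a plain subtract.
    const std::uint32_t lhs = 100, rhs = 42;
    assert(static_cast<std::uint32_t>(lhs + ~rhs + 1) == lhs - rhs);

    // The masked shift opcodes use only the low 5 (32-bit) or 6 (64-bit) bits of the amount.
    const std::uint32_t x = 0x80000000u;
    assert((x >> (33u & 0x1f)) == (x >> 1));           // LogicalShiftRightMasked32
    assert((std::uint64_t{1} << (65u & 0x3f)) == 2);   // LogicalShiftLeftMasked64
}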
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp
new file mode 100644
index 0000000000..0d0de180a8
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp
@@ -0,0 +1,23 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <mcl/iterator/reverse.hpp>
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+void DeadCodeElimination(IR::Block& block) {
+ // We iterate over the instructions in reverse order.
+ // This is because removing an instruction reduces the number of uses for earlier instructions.
+ for (auto& inst : mcl::iterator::reverse(block)) {
+ if (!inst.HasUses() && !inst.MayHaveSideEffects()) {
+ inst.Invalidate();
+ }
+ }
+}
+
+} // namespace Dynarmic::Optimization
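
A toy model of why the reverse sweep suffices (ToyInst here is illustrative only): invalidating an unused instruction releases its argument's use, so by the time the sweep reaches the argument it is already unused too.

#include <cassert>
#include <vector>

struct ToyInst {
    int uses = 0;
    int arg = -1;  // index of the instruction this one uses, if any
    bool alive = true;
};

int main() {
    std::vector<ToyInst> block(2);
    block[0].uses = 1;  // used only by block[1]
    block[1].arg = 0;   // block[1] consumes block[0] but is itself unused

    for (int i = static_cast<int>(block.size()) - 1; i >= 0; --i) {
        if (block[i].uses == 0) {
            block[i].alive = false;
            if (block[i].arg >= 0)
                block[block[i].arg].uses--;  // releasing the argument's use
        }
    }
    assert(!block[0].alive && !block[1].alive);  // both removed in a single sweep
}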
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp
new file mode 100644
index 0000000000..e87fcc335b
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp
@@ -0,0 +1,44 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <vector>
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+void IdentityRemovalPass(IR::Block& block) {
+ std::vector<IR::Inst*> to_invalidate;
+
+ auto iter = block.begin();
+ while (iter != block.end()) {
+ IR::Inst& inst = *iter;
+
+ const size_t num_args = inst.NumArgs();
+ for (size_t i = 0; i < num_args; i++) {
+ while (true) {
+ IR::Value arg = inst.GetArg(i);
+ if (!arg.IsIdentity())
+ break;
+ inst.SetArg(i, arg.GetInst()->GetArg(0));
+ }
+ }
+
+ if (inst.GetOpcode() == IR::Opcode::Identity || inst.GetOpcode() == IR::Opcode::Void) {
+ iter = block.Instructions().erase(inst);
+ to_invalidate.push_back(&inst);
+ } else {
+ ++iter;
+ }
+ }
+
+ for (IR::Inst* inst : to_invalidate) {
+ inst->Invalidate();
+ }
+}
+
+} // namespace Dynarmic::Optimization
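
The inner while loop is the essential trick: identities may chain, so each argument is followed until a non-identity producer is reached. A minimal standalone sketch of that chase (ToyNode is hypothetical):

#include <cassert>

struct ToyNode {
    bool is_identity = false;
    ToyNode* arg = nullptr;  // for identities, the forwarded value
};

ToyNode* StripIdentities(ToyNode* value) {
    while (value->is_identity)
        value = value->arg;
    return value;
}

int main() {
    ToyNode real{};            // the actual producer
    ToyNode id1{true, &real};  // Identity(real)
    ToyNode id2{true, &id1};   // Identity(Identity(real))
    assert(StripIdentities(&id2) == &real);
}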
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h b/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h
new file mode 100644
index 0000000000..5eb1a55100
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h
@@ -0,0 +1,127 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2020 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <optional>
+#include <tuple>
+
+#include <mp/metafunction/apply.h>
+#include <mp/typelist/concat.h>
+#include <mp/typelist/drop.h>
+#include <mp/typelist/get.h>
+#include <mp/typelist/head.h>
+#include <mp/typelist/list.h>
+#include <mp/typelist/prepend.h>
+
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/value.h"
+
+namespace Dynarmic::Optimization::IRMatcher {
+
+struct CaptureValue {
+ using ReturnType = std::tuple<IR::Value>;
+
+ static std::optional<ReturnType> Match(IR::Value value) {
+ return std::tuple(value);
+ }
+};
+
+struct CaptureInst {
+ using ReturnType = std::tuple<IR::Inst*>;
+
+ static std::optional<ReturnType> Match(IR::Value value) {
+ if (value.IsImmediate())
+ return std::nullopt;
+ return std::tuple(value.GetInstRecursive());
+ }
+};
+
+struct CaptureUImm {
+ using ReturnType = std::tuple<u64>;
+
+ static std::optional<ReturnType> Match(IR::Value value) {
+ return std::tuple(value.GetImmediateAsU64());
+ }
+};
+
+struct CaptureSImm {
+ using ReturnType = std::tuple<s64>;
+
+ static std::optional<ReturnType> Match(IR::Value value) {
+ return std::tuple(value.GetImmediateAsS64());
+ }
+};
+
+template<u64 Value>
+struct UImm {
+ using ReturnType = std::tuple<>;
+
+ static std::optional<std::tuple<>> Match(IR::Value value) {
+ if (value.GetImmediateAsU64() == Value)
+ return std::tuple();
+ return std::nullopt;
+ }
+};
+
+template<s64 Value>
+struct SImm {
+ using ReturnType = std::tuple<>;
+
+ static std::optional<std::tuple<>> Match(IR::Value value) {
+ if (value.GetImmediateAsS64() == Value)
+ return std::tuple();
+ return std::nullopt;
+ }
+};
+
+template<IR::Opcode Opcode, typename... Args>
+struct Inst {
+public:
+ using ReturnType = mp::concat<std::tuple<>, typename Args::ReturnType...>;
+
+ static std::optional<ReturnType> Match(const IR::Inst& inst) {
+ if (inst.GetOpcode() != Opcode)
+ return std::nullopt;
+ if (inst.HasAssociatedPseudoOperation())
+ return std::nullopt;
+ return MatchArgs<0>(inst);
+ }
+
+ static std::optional<ReturnType> Match(IR::Value value) {
+ if (value.IsImmediate())
+ return std::nullopt;
+ return Match(*value.GetInstRecursive());
+ }
+
+private:
+ template<size_t I>
+ static auto MatchArgs(const IR::Inst& inst) -> std::optional<mp::apply<mp::concat, mp::prepend<mp::drop<I, mp::list<typename Args::ReturnType...>>, std::tuple<>>>> {
+ if constexpr (I >= sizeof...(Args)) {
+ return std::tuple();
+ } else {
+ using Arg = mp::get<I, mp::list<Args...>>;
+
+ if (const auto arg = Arg::Match(inst.GetArg(I))) {
+ if (const auto rest = MatchArgs<I + 1>(inst)) {
+ return std::tuple_cat(*arg, *rest);
+ }
+ }
+
+ return std::nullopt;
+ }
+ }
+};
+
+inline bool IsSameInst(std::tuple<IR::Inst*, IR::Inst*> t) {
+ return std::get<0>(t) == std::get<1>(t);
+}
+
+inline bool IsSameInst(std::tuple<IR::Inst*, IR::Inst*, IR::Inst*> t) {
+ return std::get<0>(t) == std::get<1>(t) && std::get<0>(t) == std::get<2>(t);
+}
+
+} // namespace Dynarmic::Optimization::IRMatcher
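
As a usage sketch only (it builds only inside the dynarmic tree, and IsDoubleNot is a hypothetical helper): matchers compose by nesting, so a pattern such as Not32(Not32(x)) can be matched and the producer of x captured in one call.

#include <tuple>

#include "dynarmic/ir/microinstruction.h"
#include "dynarmic/ir/opcodes.h"
#include "dynarmic/ir/opt/ir_matcher.h"

namespace Example {

using namespace Dynarmic;
using namespace Dynarmic::Optimization;

// Returns true and captures the innermost producer if inst is Not32(Not32(x)).
bool IsDoubleNot(const IR::Inst& inst, IR::Inst*& source) {
    using Matcher = IRMatcher::Inst<IR::Opcode::Not32,
                                    IRMatcher::Inst<IR::Opcode::Not32, IRMatcher::CaptureInst>>;
    if (const auto capture = Matcher::Match(inst)) {
        source = std::get<0>(*capture);
        return true;
    }
    return false;
}

}  // namespace Example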
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp
new file mode 100644
index 0000000000..a766bdc83f
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp
@@ -0,0 +1,18 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2023 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+
+namespace Dynarmic::Optimization {
+
+void NamingPass(IR::Block& block) {
+ unsigned name = 1;
+ for (auto& inst : block) {
+ inst.SetName(name++);
+ }
+}
+
+} // namespace Dynarmic::Optimization
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/passes.h b/externals/dynarmic/src/dynarmic/ir/opt/passes.h
new file mode 100644
index 0000000000..703145b556
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/passes.h
@@ -0,0 +1,47 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+namespace Dynarmic::A32 {
+struct UserCallbacks;
+}
+
+namespace Dynarmic::A64 {
+struct UserCallbacks;
+struct UserConfig;
+} // namespace Dynarmic::A64
+
+namespace Dynarmic::IR {
+class Block;
+}
+
+namespace Dynarmic::Optimization {
+
+struct PolyfillOptions {
+ bool sha256 = false;
+ bool vector_multiply_widen = false;
+
+ bool operator==(const PolyfillOptions&) const = default;
+};
+
+struct A32GetSetEliminationOptions {
+ bool convert_nzc_to_nz = false;
+ bool convert_nz_to_nzc = false;
+};
+
+void PolyfillPass(IR::Block& block, const PolyfillOptions& opt);
+void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb);
+void A32GetSetElimination(IR::Block& block, A32GetSetEliminationOptions opt);
+void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf);
+void A64GetSetElimination(IR::Block& block);
+void A64MergeInterpretBlocksPass(IR::Block& block, A64::UserCallbacks* cb);
+void ConstantPropagation(IR::Block& block);
+void DeadCodeElimination(IR::Block& block);
+void IdentityRemovalPass(IR::Block& block);
+void VerificationPass(const IR::Block& block);
+void NamingPass(IR::Block& block);
+
+} // namespace Dynarmic::Optimization
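
The backends decide which passes run and in which order; purely as an illustrative (assumed, not actual) pipeline, a caller could chain the declarations above like this:

#include "dynarmic/ir/basic_block.h"
#include "dynarmic/ir/opt/passes.h"

// Illustrative ordering only; the real sequences live in the backend address-space code.
void RunExampleA64Pipeline(Dynarmic::IR::Block& block) {
    namespace Opt = Dynarmic::Optimization;
    Opt::PolyfillPass(block, Opt::PolyfillOptions{});  // no-op with default options
    Opt::A64GetSetElimination(block);
    Opt::ConstantPropagation(block);
    Opt::DeadCodeElimination(block);
    Opt::IdentityRemovalPass(block);
    Opt::VerificationPass(block);
}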
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
new file mode 100644
index 0000000000..1aa3aea91e
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
@@ -0,0 +1,218 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/ir_emitter.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+namespace {
+
+void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
+ const IR::U128 x = (IR::U128)inst.GetArg(0);
+ const IR::U128 y = (IR::U128)inst.GetArg(1);
+
+ const IR::U128 t = ir.VectorExtract(x, y, 32);
+
+ IR::U128 result = ir.ZeroVector();
+ for (size_t i = 0; i < 4; i++) {
+ const IR::U32 modified_element = [&] {
+ const IR::U32 element = ir.VectorGetElement(32, t, i);
+ const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7));
+ const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18));
+ const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3));
+
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+ }();
+
+ result = ir.VectorSetElement(32, result, i, modified_element);
+ }
+ result = ir.VectorAdd(32, result, x);
+
+ inst.ReplaceUsesWith(result);
+}
+
+void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
+ const IR::U128 x = (IR::U128)inst.GetArg(0);
+ const IR::U128 y = (IR::U128)inst.GetArg(1);
+ const IR::U128 z = (IR::U128)inst.GetArg(2);
+
+ const IR::U128 T0 = ir.VectorExtract(y, z, 32);
+
+ const IR::U128 lower_half = [&] {
+ const IR::U128 T = ir.VectorRotateWholeVectorRight(z, 64);
+ const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
+ const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
+ const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
+ const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
+ const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, x, T0));
+ return ir.VectorZeroUpper(tmp5);
+ }();
+
+ const IR::U64 upper_half = [&] {
+ const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17);
+ const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19);
+ const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10);
+ const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
+
+ // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
+ const IR::U128 shuffled_d = ir.VectorRotateWholeVectorRight(x, 64);
+ const IR::U128 shuffled_T0 = ir.VectorRotateWholeVectorRight(T0, 64);
+
+ const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
+ return ir.VectorGetElement(64, tmp5, 0);
+ }();
+
+ const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);
+
+ inst.ReplaceUsesWith(result);
+}
+
+IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+ return ir.Eor(ir.And(ir.Eor(y, z), x), z);
+}
+
+IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+ return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z));
+}
+
+IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
+ const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2));
+ const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13));
+ const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22));
+
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
+ const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6));
+ const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11));
+ const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25));
+
+ return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
+ IR::U128 x = (IR::U128)inst.GetArg(0);
+ IR::U128 y = (IR::U128)inst.GetArg(1);
+ const IR::U128 w = (IR::U128)inst.GetArg(2);
+ const bool part1 = inst.GetArg(3).GetU1();
+
+ for (size_t i = 0; i < 4; i++) {
+ const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
+ const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
+ const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
+ const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
+
+ const IR::U32 low_y = ir.VectorGetElement(32, y, 0);
+ const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1);
+ const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2);
+ const IR::U32 high_y = ir.VectorGetElement(32, y, 3);
+
+ const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y);
+ const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x);
+
+ const IR::U32 t = [&] {
+ const IR::U32 w_element = ir.VectorGetElement(32, w, i);
+ const IR::U32 sig = SHAhashSIGMA1(ir, low_y);
+
+ return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element)));
+ }();
+
+ const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority));
+ const IR::U32 new_low_y = ir.Add(t, high_x);
+
+ // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
+ const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96);
+ const IR::U128 shuffled_y = ir.VectorRotateWholeVectorRight(y, 96);
+
+ x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);
+ y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);
+ }
+
+ inst.ReplaceUsesWith(part1 ? x : y);
+}
+
+template<size_t esize, bool is_signed>
+void PolyfillVectorMultiplyWiden(IR::IREmitter& ir, IR::Inst& inst) {
+ IR::U128 n = (IR::U128)inst.GetArg(0);
+ IR::U128 m = (IR::U128)inst.GetArg(1);
+
+ const IR::U128 wide_n = is_signed ? ir.VectorSignExtend(esize, n) : ir.VectorZeroExtend(esize, n);
+ const IR::U128 wide_m = is_signed ? ir.VectorSignExtend(esize, m) : ir.VectorZeroExtend(esize, m);
+
+ const IR::U128 result = ir.VectorMultiply(esize * 2, wide_n, wide_m);
+
+ inst.ReplaceUsesWith(result);
+}
+
+} // namespace
+
+void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
+ if (polyfill == PolyfillOptions{}) {
+ return;
+ }
+
+ IR::IREmitter ir{block};
+
+ for (auto& inst : block) {
+ ir.SetInsertionPointBefore(&inst);
+
+ switch (inst.GetOpcode()) {
+ case IR::Opcode::SHA256MessageSchedule0:
+ if (polyfill.sha256) {
+ PolyfillSHA256MessageSchedule0(ir, inst);
+ }
+ break;
+ case IR::Opcode::SHA256MessageSchedule1:
+ if (polyfill.sha256) {
+ PolyfillSHA256MessageSchedule1(ir, inst);
+ }
+ break;
+ case IR::Opcode::SHA256Hash:
+ if (polyfill.sha256) {
+ PolyfillSHA256Hash(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplySignedWiden8:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<8, true>(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplySignedWiden16:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<16, true>(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplySignedWiden32:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<32, true>(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplyUnsignedWiden8:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<8, false>(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplyUnsignedWiden16:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<16, false>(ir, inst);
+ }
+ break;
+ case IR::Opcode::VectorMultiplyUnsignedWiden32:
+ if (polyfill.vector_multiply_widen) {
+ PolyfillVectorMultiplyWiden<32, false>(ir, inst);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+} // namespace Dynarmic::Optimization
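
The SHAchoose and SHAmajority forms above are algebraic rewrites of the usual FIPS-180 Ch and Maj functions; the standalone check below (not dynarmic code) confirms the equivalence on sample values.

#include <cassert>
#include <cstdint>

int main() {
    const std::uint32_t x = 0xDEADBEEF, y = 0x12345678, z = 0x0F0F0F0F;

    // SHAchoose form vs Ch(x,y,z) = (x & y) ^ (~x & z)
    const std::uint32_t choose = ((y ^ z) & x) ^ z;
    assert(choose == ((x & y) ^ (~x & z)));

    // SHAmajority form vs Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z)
    const std::uint32_t majority = (x & y) | ((x | y) & z);
    assert(majority == ((x & y) ^ (x & z) ^ (y & z)));
}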
diff --git a/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp
new file mode 100644
index 0000000000..8bc9e6f1f0
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp
@@ -0,0 +1,47 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include <cstdio>
+#include <map>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::Optimization {
+
+void VerificationPass(const IR::Block& block) {
+ for (const auto& inst : block) {
+ for (size_t i = 0; i < inst.NumArgs(); i++) {
+ const IR::Type t1 = inst.GetArg(i).GetType();
+ const IR::Type t2 = IR::GetArgTypeOf(inst.GetOpcode(), i);
+ if (!IR::AreTypesCompatible(t1, t2)) {
+ std::puts(IR::DumpBlock(block).c_str());
+ ASSERT_FALSE("above block failed validation");
+ }
+ }
+ }
+
+ std::map<IR::Inst*, size_t> actual_uses;
+ for (const auto& inst : block) {
+ for (size_t i = 0; i < inst.NumArgs(); i++) {
+ const auto arg = inst.GetArg(i);
+ if (!arg.IsImmediate()) {
+ actual_uses[arg.GetInst()]++;
+ }
+ }
+ }
+
+ for (const auto& pair : actual_uses) {
+ ASSERT(pair.first->UseCount() == pair.second);
+ }
+}
+
+} // namespace Dynarmic::Optimization
diff --git a/externals/dynarmic/src/dynarmic/ir/terminal.h b/externals/dynarmic/src/dynarmic/ir/terminal.h
new file mode 100644
index 0000000000..d437ffd5b6
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/terminal.h
@@ -0,0 +1,131 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <boost/variant.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/cond.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::IR {
+namespace Term {
+
+struct Invalid {};
+
+/**
+ * This terminal instruction calls the interpreter, starting at `next`.
+ * The interpreter must interpret exactly `num_instructions` instructions.
+ */
+struct Interpret {
+ explicit Interpret(const LocationDescriptor& next_)
+ : next(next_) {}
+ LocationDescriptor next; ///< Location at which interpretation starts.
+ size_t num_instructions = 1;
+};
+
+/**
+ * This terminal instruction returns control to the dispatcher.
+ * The dispatcher will use the current cpu state to determine what comes next.
+ */
+struct ReturnToDispatch {};
+
+/**
+ * This terminal instruction jumps to the basic block described by `next` if we have enough
+ * cycles remaining. If we do not have enough cycles remaining, we return to the
+ * dispatcher, which will return control to the host.
+ */
+struct LinkBlock {
+ explicit LinkBlock(const LocationDescriptor& next_)
+ : next(next_) {}
+ LocationDescriptor next; ///< Location descriptor for next block.
+};
+
+/**
+ * This terminal instruction jumps to the basic block described by `next` unconditionally.
+ * This is an optimization and MUST only be emitted when this is guaranteed not to result
+ * in hanging, even in the face of other optimizations. (In practice, this means that only
+ * forward jumps to short-ish blocks would use this instruction.)
+ * A backend that doesn't support this optimization may choose to implement this exactly
+ * as LinkBlock.
+ */
+struct LinkBlockFast {
+ explicit LinkBlockFast(const LocationDescriptor& next_)
+ : next(next_) {}
+ LocationDescriptor next; ///< Location descriptor for next block.
+};
+
+/**
+ * This terminal instruction checks the top of the Return Stack Buffer against the current
+ * location descriptor. If RSB lookup fails, control is returned to the dispatcher.
+ * This is an optimization for faster function calls. A backend that doesn't support
+ * this optimization or doesn't have a RSB may choose to implement this exactly as
+ * ReturnToDispatch.
+ */
+struct PopRSBHint {};
+
+/**
+ * This terminal instruction performs a lookup of the current location descriptor in the
+ * fast dispatch lookup table. A backend that doesn't support this optimization may choose
+ * to implement this exactly as ReturnToDispatch.
+ */
+struct FastDispatchHint {};
+
+struct If;
+struct CheckBit;
+struct CheckHalt;
+/// A Terminal is the terminal instruction in a MicroBlock.
+using Terminal = boost::variant<
+ Invalid,
+ Interpret,
+ ReturnToDispatch,
+ LinkBlock,
+ LinkBlockFast,
+ PopRSBHint,
+ FastDispatchHint,
+ boost::recursive_wrapper<If>,
+ boost::recursive_wrapper<CheckBit>,
+ boost::recursive_wrapper<CheckHalt>>;
+
+/**
+ * This terminal instruction conditionally executes one terminal or another depending
+ * on the run-time state of the ARM flags.
+ */
+struct If {
+ If(Cond if_, Terminal then_, Terminal else_)
+ : if_(if_), then_(std::move(then_)), else_(std::move(else_)) {}
+ Cond if_;
+ Terminal then_;
+ Terminal else_;
+};
+
+/**
+ * This terminal instruction conditionally executes one terminal or another depending
+ * on the run-time state of the check bit.
+ * then_ is executed if the check bit is non-zero, otherwise else_ is executed.
+ */
+struct CheckBit {
+ CheckBit(Terminal then_, Terminal else_)
+ : then_(std::move(then_)), else_(std::move(else_)) {}
+ Terminal then_;
+ Terminal else_;
+};
+
+/**
+ * This terminal instruction checks if a halt was requested. If it wasn't, else_ is
+ * executed.
+ */
+struct CheckHalt {
+ explicit CheckHalt(Terminal else_)
+ : else_(std::move(else_)) {}
+ Terminal else_;
+};
+
+} // namespace Term
+
+using Term::Terminal;
+
+} // namespace Dynarmic::IR
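
Terminals nest, and consumers typically probe them with boost::get exactly as the merge-interpret-blocks pass does. A small usage sketch follows (EndsInDispatch is a hypothetical helper, buildable only inside the dynarmic tree); for instance, EndsInDispatch(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}) would return true.

#include <boost/variant/get.hpp>

#include "dynarmic/ir/terminal.h"

// Does this terminal (possibly behind a CheckHalt) return control to the dispatcher?
bool EndsInDispatch(const Dynarmic::IR::Terminal& terminal) {
    namespace Term = Dynarmic::IR::Term;
    if (const auto check_halt = boost::get<Term::CheckHalt>(&terminal)) {
        return boost::get<Term::ReturnToDispatch>(&check_halt->else_) != nullptr;
    }
    return boost::get<Term::ReturnToDispatch>(&terminal) != nullptr;
}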
diff --git a/externals/dynarmic/src/dynarmic/ir/type.cpp b/externals/dynarmic/src/dynarmic/ir/type.cpp
new file mode 100644
index 0000000000..7c0f6bf622
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/type.cpp
@@ -0,0 +1,46 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/type.h"
+
+#include <array>
+#include <ostream>
+#include <string>
+
+namespace Dynarmic::IR {
+
+std::string GetNameOf(Type type) {
+ static constexpr std::array names{
+ "A32Reg", "A32ExtReg",
+ "A64Reg", "A64Vec",
+ "Opaque",
+ "U1", "U8", "U16", "U32", "U64", "U128",
+ "CoprocInfo",
+ "NZCVFlags",
+ "Cond",
+ "Table"};
+
+ const size_t bits = static_cast<size_t>(type);
+ if (bits == 0) {
+ return "Void";
+ }
+
+ std::string result;
+ for (size_t i = 0; i < names.size(); i++) {
+ if ((bits & (size_t(1) << i)) != 0) {
+ if (!result.empty()) {
+ result += '|';
+ }
+ result += names[i];
+ }
+ }
+ return result;
+}
+
+bool AreTypesCompatible(Type t1, Type t2) {
+ return t1 == t2 || t1 == Type::Opaque || t2 == Type::Opaque;
+}
+
+} // namespace Dynarmic::IR
diff --git a/externals/dynarmic/src/dynarmic/ir/type.h b/externals/dynarmic/src/dynarmic/ir/type.h
new file mode 100644
index 0000000000..65fe76dd65
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/type.h
@@ -0,0 +1,60 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <string>
+
+#include <fmt/format.h>
+#include <mcl/stdint.hpp>
+
+namespace Dynarmic::IR {
+
+/**
+ * The intermediate representation is typed. These are the types used by our IR.
+ */
+enum class Type {
+ Void = 0,
+ A32Reg = 1 << 0,
+ A32ExtReg = 1 << 1,
+ A64Reg = 1 << 2,
+ A64Vec = 1 << 3,
+ Opaque = 1 << 4,
+ U1 = 1 << 5,
+ U8 = 1 << 6,
+ U16 = 1 << 7,
+ U32 = 1 << 8,
+ U64 = 1 << 9,
+ U128 = 1 << 10,
+ CoprocInfo = 1 << 11,
+ NZCVFlags = 1 << 12,
+ Cond = 1 << 13,
+ Table = 1 << 14,
+ AccType = 1 << 15,
+};
+
+constexpr Type operator|(Type a, Type b) {
+ return static_cast<Type>(static_cast<size_t>(a) | static_cast<size_t>(b));
+}
+
+constexpr Type operator&(Type a, Type b) {
+ return static_cast<Type>(static_cast<size_t>(a) & static_cast<size_t>(b));
+}
+
+/// Get the name of a type.
+std::string GetNameOf(Type type);
+
+/// @returns true if t1 and t2 are compatible types
+bool AreTypesCompatible(Type t1, Type t2);
+
+} // namespace Dynarmic::IR
+
+template<>
+struct fmt::formatter<Dynarmic::IR::Type> : fmt::formatter<std::string> {
+ template<typename FormatContext>
+ auto format(Dynarmic::IR::Type type, FormatContext& ctx) const {
+ return formatter<std::string>::format(Dynarmic::IR::GetNameOf(type), ctx);
+ }
+};
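
Because Type is a bitmask, composite constraints such as U32|U64 are ordinary values, and Opaque is treated as compatible with everything. A small sketch (requires the dynarmic headers):

#include <cassert>

#include "dynarmic/ir/type.h"

int main() {
    using Dynarmic::IR::Type;
    const Type word_or_long = Type::U32 | Type::U64;
    assert((word_or_long & Type::U32) != Type::Void);
    assert(Dynarmic::IR::GetNameOf(word_or_long) == "U32|U64");
    assert(Dynarmic::IR::AreTypesCompatible(Type::U32, Type::Opaque));
}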
diff --git a/externals/dynarmic/src/dynarmic/ir/value.cpp b/externals/dynarmic/src/dynarmic/ir/value.cpp
new file mode 100644
index 0000000000..20f94cb5f9
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/value.cpp
@@ -0,0 +1,254 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/value.h"
+
+#include <mcl/assert.hpp>
+#include <mcl/bit/bit_field.hpp>
+
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::IR {
+
+Value::Value(Inst* value)
+ : type(Type::Opaque) {
+ inner.inst = value;
+}
+
+Value::Value(A32::Reg value)
+ : type(Type::A32Reg) {
+ inner.imm_a32regref = value;
+}
+
+Value::Value(A32::ExtReg value)
+ : type(Type::A32ExtReg) {
+ inner.imm_a32extregref = value;
+}
+
+Value::Value(A64::Reg value)
+ : type(Type::A64Reg) {
+ inner.imm_a64regref = value;
+}
+
+Value::Value(A64::Vec value)
+ : type(Type::A64Vec) {
+ inner.imm_a64vecref = value;
+}
+
+Value::Value(bool value)
+ : type(Type::U1) {
+ inner.imm_u1 = value;
+}
+
+Value::Value(u8 value)
+ : type(Type::U8) {
+ inner.imm_u8 = value;
+}
+
+Value::Value(u16 value)
+ : type(Type::U16) {
+ inner.imm_u16 = value;
+}
+
+Value::Value(u32 value)
+ : type(Type::U32) {
+ inner.imm_u32 = value;
+}
+
+Value::Value(u64 value)
+ : type(Type::U64) {
+ inner.imm_u64 = value;
+}
+
+Value::Value(CoprocessorInfo value)
+ : type(Type::CoprocInfo) {
+ inner.imm_coproc = value;
+}
+
+Value::Value(Cond value)
+ : type(Type::Cond) {
+ inner.imm_cond = value;
+}
+
+Value::Value(AccType value)
+ : type(Type::AccType) {
+ inner.imm_acctype = value;
+}
+
+Value Value::EmptyNZCVImmediateMarker() {
+ Value result{};
+ result.type = Type::NZCVFlags;
+ return result;
+}
+
+bool Value::IsIdentity() const {
+ if (type == Type::Opaque)
+ return inner.inst->GetOpcode() == Opcode::Identity;
+ return false;
+}
+
+bool Value::IsImmediate() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).IsImmediate();
+ return type != Type::Opaque;
+}
+
+bool Value::IsEmpty() const {
+ return type == Type::Void;
+}
+
+Type Value::GetType() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetType();
+ if (type == Type::Opaque)
+ return inner.inst->GetType();
+ return type;
+}
+
+A32::Reg Value::GetA32RegRef() const {
+ ASSERT(type == Type::A32Reg);
+ return inner.imm_a32regref;
+}
+
+A32::ExtReg Value::GetA32ExtRegRef() const {
+ ASSERT(type == Type::A32ExtReg);
+ return inner.imm_a32extregref;
+}
+
+A64::Reg Value::GetA64RegRef() const {
+ ASSERT(type == Type::A64Reg);
+ return inner.imm_a64regref;
+}
+
+A64::Vec Value::GetA64VecRef() const {
+ ASSERT(type == Type::A64Vec);
+ return inner.imm_a64vecref;
+}
+
+Inst* Value::GetInst() const {
+ ASSERT(type == Type::Opaque);
+ return inner.inst;
+}
+
+Inst* Value::GetInstRecursive() const {
+ ASSERT(type == Type::Opaque);
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetInstRecursive();
+ return inner.inst;
+}
+
+bool Value::GetU1() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetU1();
+ ASSERT(type == Type::U1);
+ return inner.imm_u1;
+}
+
+u8 Value::GetU8() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetU8();
+ ASSERT(type == Type::U8);
+ return inner.imm_u8;
+}
+
+u16 Value::GetU16() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetU16();
+ ASSERT(type == Type::U16);
+ return inner.imm_u16;
+}
+
+u32 Value::GetU32() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetU32();
+ ASSERT(type == Type::U32);
+ return inner.imm_u32;
+}
+
+u64 Value::GetU64() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetU64();
+ ASSERT(type == Type::U64);
+ return inner.imm_u64;
+}
+
+Value::CoprocessorInfo Value::GetCoprocInfo() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetCoprocInfo();
+ ASSERT(type == Type::CoprocInfo);
+ return inner.imm_coproc;
+}
+
+Cond Value::GetCond() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetCond();
+ ASSERT(type == Type::Cond);
+ return inner.imm_cond;
+}
+
+AccType Value::GetAccType() const {
+ if (IsIdentity())
+ return inner.inst->GetArg(0).GetAccType();
+ ASSERT(type == Type::AccType);
+ return inner.imm_acctype;
+}
+
+s64 Value::GetImmediateAsS64() const {
+ ASSERT(IsImmediate());
+
+ switch (GetType()) {
+ case IR::Type::U1:
+ return s64(GetU1());
+ case IR::Type::U8:
+ return s64(mcl::bit::sign_extend<8, u64>(GetU8()));
+ case IR::Type::U16:
+ return s64(mcl::bit::sign_extend<16, u64>(GetU16()));
+ case IR::Type::U32:
+ return s64(mcl::bit::sign_extend<32, u64>(GetU32()));
+ case IR::Type::U64:
+ return s64(GetU64());
+ default:
+ ASSERT_FALSE("GetImmediateAsS64 called on an incompatible Value type.");
+ }
+}
+
+u64 Value::GetImmediateAsU64() const {
+ ASSERT(IsImmediate());
+
+ switch (GetType()) {
+ case IR::Type::U1:
+ return u64(GetU1());
+ case IR::Type::U8:
+ return u64(GetU8());
+ case IR::Type::U16:
+ return u64(GetU16());
+ case IR::Type::U32:
+ return u64(GetU32());
+ case IR::Type::U64:
+ return u64(GetU64());
+ default:
+ ASSERT_FALSE("GetImmediateAsU64 called on an incompatible Value type.");
+ }
+}
+
+bool Value::IsSignedImmediate(s64 value) const {
+ return IsImmediate() && GetImmediateAsS64() == value;
+}
+
+bool Value::IsUnsignedImmediate(u64 value) const {
+ return IsImmediate() && GetImmediateAsU64() == value;
+}
+
+bool Value::HasAllBitsSet() const {
+ return IsSignedImmediate(-1);
+}
+
+bool Value::IsZero() const {
+ return IsUnsignedImmediate(0);
+}
+
+} // namespace Dynarmic::IR
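
A short usage sketch of the immediate accessors (requires the dynarmic headers): narrow immediates are sign-extended by GetImmediateAsS64 but zero-extended by GetImmediateAsU64.

#include <cassert>
#include <cstdint>

#include "dynarmic/ir/value.h"

int main() {
    const Dynarmic::IR::Value v{static_cast<std::uint8_t>(0x80)};
    assert(v.GetImmediateAsS64() == -128);   // sign-extended from 8 bits
    assert(v.GetImmediateAsU64() == 0x80);   // zero-extended from 8 bits
    assert(v.IsSignedImmediate(-128) && v.IsUnsignedImmediate(0x80));
    assert(!v.HasAllBitsSet() && !v.IsZero());
}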
diff --git a/externals/dynarmic/src/dynarmic/ir/value.h b/externals/dynarmic/src/dynarmic/ir/value.h
new file mode 100644
index 0000000000..8f70110a8d
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/ir/value.h
@@ -0,0 +1,187 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2016 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+#include <type_traits>
+
+#include <mcl/assert.hpp>
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/ir/type.h"
+
+namespace Dynarmic::A32 {
+enum class ExtReg;
+enum class Reg;
+} // namespace Dynarmic::A32
+
+namespace Dynarmic::A64 {
+enum class Reg;
+enum class Vec;
+} // namespace Dynarmic::A64
+
+namespace Dynarmic::IR {
+
+class Inst;
+enum class AccType;
+enum class Cond;
+
+/**
+ * A representation of a value in the IR.
+ * A value may either be an immediate or the result of a microinstruction.
+ */
+class Value {
+public:
+ using CoprocessorInfo = std::array<u8, 8>;
+
+ Value()
+ : type(Type::Void) {}
+ explicit Value(Inst* value);
+ explicit Value(A32::Reg value);
+ explicit Value(A32::ExtReg value);
+ explicit Value(A64::Reg value);
+ explicit Value(A64::Vec value);
+ explicit Value(bool value);
+ explicit Value(u8 value);
+ explicit Value(u16 value);
+ explicit Value(u32 value);
+ explicit Value(u64 value);
+ explicit Value(CoprocessorInfo value);
+ explicit Value(Cond value);
+ explicit Value(AccType value);
+
+ static Value EmptyNZCVImmediateMarker();
+
+ bool IsIdentity() const;
+ bool IsEmpty() const;
+ bool IsImmediate() const;
+ Type GetType() const;
+
+ Inst* GetInst() const;
+ Inst* GetInstRecursive() const;
+ A32::Reg GetA32RegRef() const;
+ A32::ExtReg GetA32ExtRegRef() const;
+ A64::Reg GetA64RegRef() const;
+ A64::Vec GetA64VecRef() const;
+ bool GetU1() const;
+ u8 GetU8() const;
+ u16 GetU16() const;
+ u32 GetU32() const;
+ u64 GetU64() const;
+ CoprocessorInfo GetCoprocInfo() const;
+ Cond GetCond() const;
+ AccType GetAccType() const;
+
+ /**
+ * Retrieves the immediate of a Value instance as a signed 64-bit value.
+ *
+ * @pre The value contains either a U1, U8, U16, U32, or U64 value.
+ * Breaking this precondition will cause an assertion to be invoked.
+ */
+ s64 GetImmediateAsS64() const;
+
+ /**
+ * Retrieves the immediate of a Value instance as an unsigned 64-bit value.
+ *
+ * @pre The value contains either a U1, U8, U16, U32, or U64 value.
+ * Breaking this precondition will cause an assertion to be invoked.
+ */
+ u64 GetImmediateAsU64() const;
+
+ /**
+ * Determines whether or not the contained value matches the provided signed one.
+ *
+ * Note that this function will always return false if the contained
+ * value is not a constant value. In other words, if IsImmediate()
+ * would return false on an instance, then so will this function.
+ *
+ * @param value The value to check against the contained value.
+ */
+ bool IsSignedImmediate(s64 value) const;
+
+ /**
+ * Determines whether or not the contained value matches the provided unsigned one.
+ *
+ * Note that this function will always return false if the contained
+ * value is not a constant value. In other words, if IsImmediate()
+ * would return false on an instance, then so will this function.
+ *
+ * @param value The value to check against the contained value.
+ */
+ bool IsUnsignedImmediate(u64 value) const;
+
+ /**
+ * Determines whether or not the contained constant value has all bits set.
+ *
+ * @pre The value contains either a U1, U8, U16, U32, or U64 value.
+ * Breaking this precondition will cause an assertion to be invoked.
+ */
+ bool HasAllBitsSet() const;
+
+ /**
+ * Whether or not the current value contains a representation of zero.
+ *
+ * Note that this function will always return false if the contained
+ * value is not a constant value. In other words, if IsImmediate()
+ * would return false on an instance, then so will this function.
+ */
+ bool IsZero() const;
+
+private:
+ Type type;
+
+ union {
+ Inst* inst; // type == Type::Opaque
+ A32::Reg imm_a32regref;
+ A32::ExtReg imm_a32extregref;
+ A64::Reg imm_a64regref;
+ A64::Vec imm_a64vecref;
+ bool imm_u1;
+ u8 imm_u8;
+ u16 imm_u16;
+ u32 imm_u32;
+ u64 imm_u64;
+ CoprocessorInfo imm_coproc;
+ Cond imm_cond;
+ AccType imm_acctype;
+ } inner;
+};
+static_assert(sizeof(Value) <= 2 * sizeof(u64), "IR::Value should be kept small in size");
+
+template<Type type_>
+class TypedValue final : public Value {
+public:
+ TypedValue() = default;
+
+ template<Type other_type, typename = std::enable_if_t<(other_type & type_) != Type::Void>>
+ /* implicit */ TypedValue(const TypedValue<other_type>& value)
+ : Value(value) {
+ ASSERT((value.GetType() & type_) != Type::Void);
+ }
+
+ explicit TypedValue(const Value& value)
+ : Value(value) {
+ ASSERT((value.GetType() & type_) != Type::Void);
+ }
+
+ explicit TypedValue(Inst* inst)
+ : TypedValue(Value(inst)) {}
+};
+
+using U1 = TypedValue<Type::U1>;
+using U8 = TypedValue<Type::U8>;
+using U16 = TypedValue<Type::U16>;
+using U32 = TypedValue<Type::U32>;
+using U64 = TypedValue<Type::U64>;
+using U128 = TypedValue<Type::U128>;
+using U32U64 = TypedValue<Type::U32 | Type::U64>;
+using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>;
+using UAny = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;
+using UAnyU128 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64 | Type::U128>;
+using NZCV = TypedValue<Type::NZCVFlags>;
+using Table = TypedValue<Type::Table>;
+
+} // namespace Dynarmic::IR
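
The width constraints in TypedValue combine a compile-time bitmask check with a runtime assertion, so conversions between compatible aliases are implicit. A small sketch (requires the dynarmic headers; the variable names are illustrative):

#include <cstdint>

#include "dynarmic/ir/value.h"

int main() {
    using namespace Dynarmic;
    const IR::U32 word{IR::Value{static_cast<std::uint32_t>(5)}};
    const IR::U32U64 either = word;  // ok: U32 & (U32|U64) != Void
    return either.GetU32() == 5 ? 0 : 1;
}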